Importing libraries

In [1]:
import pandas as pd
import seaborn as sns
import nltk
import numpy as np
import matplotlib.pyplot as plt

Display all columns and rows, adjust image size

In [None]:
from pylab import rcParams

# Print DataFrames in full: no row limit, no column limit, and lines up to
# 1000 characters wide before wrapping.
for option_name, option_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Default size and resolution for every figure drawn below.
rcParams.update({'figure.figsize': (12, 6), 'figure.dpi': 300})

Importing text data

In [3]:
"""
Approach 1: prepare the csv file yourself and import into Python
"""
data=pd.read_csv('Lecture9.csv')
data=pd.read_csv('https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/HG4054%20Language%20and%20Society%20Through%20Data%20Analytics/Lecture9.csv')

"""
Approach 2: If using an IDE like Spyder or VSCode
Use python to import multiple text files from a directory, and convert to dataframe.
"""
import os

#set the path to the folder containing the text files
folder_path = "/Users/dennistay/desktop/Lecture_samples/" #a mac path
folder_path = "C:/Users/dztay/Desktop/Lecture_samples"    #a windows path

#initialize empty lists to store the file names and contents
file_names = []
file_contents = []

#loop through each file in the folder and read its contents
for file_name in os.listdir(folder_path):
    if file_name.endswith('.txt'):
        with open(os.path.join(folder_path, file_name), 'r', encoding='utf-16') as file:
            file_names.append(file_name)
            file_lines = file.readlines()
            file_text = ''.join(file_lines).replace('\n', ' ') #replaces empty linebreaks with a space
            file_contents.append(file_text)

#create a dataframe from the file names and contents
corpus = pd.DataFrame({'file_name': file_names, 'text': file_contents})

#save to csv if needed
corpus.to_csv('docs.csv')



"""
Approach 3: If using Google Colab, which is a cloud platform, we can't read files directly from the computer.
We need to upload them first.
"""
from google.colab import files

# Upload text files using Google Colab interface
uploaded = files.upload()


# Initialize empty lists to store the file names and contents
file_names = []
file_contents = []

# Loop through each uploaded file and read its contents
for file_name in uploaded.keys():
    if file_name.endswith('.txt'):
        file_text = uploaded[file_name].decode('utf-16').replace('\n', ' ')
        file_names.append(file_name)
        file_contents.append(file_text)

# Create a dataframe from the file names and contents
corpus = pd.DataFrame({'file_name': file_names, 'text': file_contents})

# Save to csv if needed
corpus.to_csv('docs.csv')



Text cleaning


In [68]:
"""
Using Regex and NLTK
"""
import re
from nltk.stem import *
p_stemmer = PorterStemmer()

data.replace('Hong Kong','HK',regex=True,inplace=True)
data.replace('HongKong','HK',regex=True,inplace=True)
data.replace('Hongkong','HK',regex=True,inplace=True)

# Remove punctuation, special characters
data['special_removed']=data['Headline'].map(lambda x: re.sub(r'\W', ' ', x))
# Remove all single characters (e.g. s left behind after deleting aposthrophe)
data['singlechar_removed']=data['special_removed'].map(lambda x: re.sub(r'\s+[a-zA-Z]\s+', ' ', x))
# Substitute multiple spaces with single space (after removing single characters, double spaces are created)
data['singlechar_removed2']=data['singlechar_removed'].map(lambda x: re.sub(r'\s+', ' ', x, flags=re.I))
# Remove prefixed 'b' (if text string is in bytes format, a character b is appended with the string. This removes it)
data['b_removed']=data['singlechar_removed2'].map(lambda x: re.sub(r'^b\s+', ' ', x, flags=re.I))
# Convert the titles to lowercase
data['lower_case'] = data['b_removed'].map(lambda x: x.lower())
# Remove numbers (but not numbers within words)
data['num_removed'] = data['lower_case'].map(lambda x: re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x))
# Stemming to remove morphological affixes from words, leaving only the word stem
data['stemmed'] = data['num_removed'].map(lambda x: p_stemmer.stem(x))
# Finally, create final cleaned column as 'processed'
data['processed']=data['stemmed']



"""
Using TEXTHERO (https://texthero.org/)
"""
pip install texthero
import texthero as hero

# if you have problems installing, try this
pip install "gensim==4.2.0"
pip install "texthero==1.0.5"

#import our data again to more easily compare the outcomes
data2=pd.read_csv('Lecture9.csv')
data2=pd.read_csv('https://raw.githubusercontent.com/dennistay1981/Resources/refs/heads/main/HG4054%20Language%20and%20Society%20Through%20Data%20Analytics/Lecture9.csv')


data2['Headline'] = hero.clean(data2['Headline'])


TF-IDF Vectorization

In [25]:
from sklearn.feature_extraction.text import TfidfVectorizer

#apply tfidf vectorizer
#ngram_range=(1,1) counts single words only; (1,n) would also count sequences of up to n words
vectorizer = TfidfVectorizer(ngram_range=(1,1))

#fit once and reuse the result (fitting twice only repeats the same work)
tfidf = vectorizer.fit_transform(data['processed'])

#see the list of words/features
#(printed explicitly: a notebook only displays the LAST bare expression of a cell)
feature_names = vectorizer.get_feature_names_out()
print(feature_names)

#get document-term matrix. This is a 'dense matrix' because every element (including the many 0) is stored
matrix = tfidf.toarray()

#x documents, y unique words/features
print(matrix.shape)

#convert matrix to dataframe, with each feature and its corresponding tfidf score
df = pd.DataFrame(matrix, columns=feature_names)

#convert to csv if needed
df.to_csv('df.csv')

Visualizing outcomes by reducing to 2D

In [28]:
#PCA: reduce matrix to 2D if needed
from sklearn.decomposition import PCA as sklearnPCA

#Fit ONCE and keep the result, so the components shown below and the coordinates
#stored in the dataframe come from the same fit. random_state makes the result
#repeatable in case scikit-learn picks a randomized solver for this matrix size.
pca = sklearnPCA(n_components=2, random_state=0)
reduced = pca.fit_transform(matrix)

#view the linear combinations
#(printed explicitly: a notebook only displays the LAST bare expression of a cell)
print(pca.components_)

#attach reduced 2D back to dataframe for future use
data[['Dim1','Dim2']] = reduced

#data is passed by keyword so the call also works on seaborn versions
#that do not accept it as the first positional argument
sns.scatterplot(data=data, x='Dim1', y='Dim2', hue='Source')


WORD EMBEDDING with large pre-trained models

In [None]:
#Install and import GENSIM
!pip install --upgrade gensim
import gensim.downloader as api

#See list of available pre-trained models. Larger ones take longer to download.
print(api.info()['models'].keys())

# Load Google News model (300 dimensions)
model = api.load("word2vec-google-news-300")

# Load glove-wiki-gigaword-50 (50 dimensions) (https://nlp.stanford.edu/projects/glove/)
model = api.load("glove-wiki-gigaword-50")



"""
Demonstrating word embedding features
"""
#displaying the vector for a certain word
model['dog']
model['not_a_word']

#vector algebra
#finding most similar words by specifying relations
model.most_similar(positive=['woman', 'king'], negative=['male'])
model.doesnt_match("breakfast cereal dinner lunch".split())
#calculating similarity index between word pairs
model.similarity('woman', 'man')
model.similarity('woman', 'literature')
model.similarity('man', 'literature')
model.similarity('woman', 'engineer')
model.similarity('man', 'engineer')



"""
Derive word embeddings for our data
"""
nltk.download('punkt')

text_column = data['processed']
# Convert the text to a list of sentences
text_data = []
for text in text_column:
    sentence_list = nltk.sent_tokenize(text)
    text_data.extend(sentence_list)
# Preprocess the text data
preprocessed_data = []
for sentence in text_data:
    preprocessed_sentence = [word.lower() for word in sentence.split() if word.isalpha()]
    preprocessed_data.append(preprocessed_sentence)


# Derive embeddings
embedding_data = []
for sentence in preprocessed_data:
    sentence_embedding = [model.get_vector(word) for word in sentence if word in model.key_to_index]
    if sentence_embedding:
        embedding_data.append(sum(sentence_embedding) / len(sentence_embedding))
    else:
        embedding_data.append(None)

#shape of embedding data (227 sentences x 50 or 300 dimensions)
np.array(embedding_data).shape


"""
Convert embeddings to a dataframe.
Each row of the DataFrame corresponds to a sentence in the preprocessed data, and each column corresponds to a dimension of the Word2Vec embeddings.
"""
#Automatically name columns in sequence
embedding = pd.DataFrame(embedding_data, columns=['Dim{}'.format(i) for i in range(1, np.array(embedding_data).shape[1]+ 1)])


#reduce embedding to 2D with PCA, if needed
from sklearn.decomposition import PCA as sklearnPCA
pca = sklearnPCA(n_components=2)
pca.fit_transform(embedding)


#attach reduced 2D back to dataframe, for future use
data3=pd.read_csv('Lecture9.csv')
data3[['Dim1','Dim2']]=pca.fit_transform(embedding)



SEMINAR 9

In [None]:
pip install python-docx

from google.colab import drive
from google.colab import files
from docx import Document
import pandas as pd
import os
import io

"""
First, download all the Seminar9 doc files.
Then create a new folder called 'Seminar9' in your Google drive/Colab Notebooks folder
Then upload all the Seminar9 doc files into this folder

We need to do this because Google Colab is a cloud platform, so we can't read files directly
from our computer. If you are using an IDE like Spyder or VSCode, you can read the files directly using the
code below.
"""
# Mount your Google Drive to access files
drive.mount('/content/drive')

# Replace 'path/to/your/files' with the actual path to your files in Google Drive
file_path = '/content/drive/MyDrive/Colab Notebooks/Seminar9'

file_names = []
file_contents = []

# Loop through each file in the directory and read its contents
for file_name in os.listdir(file_path):
    if file_name.endswith('.docx'):
        # Construct the full file path
        full_path = os.path.join(file_path, file_name)
        # Read the .docx file content
        try:
            with open(full_path, 'rb') as f:
                doc = Document(io.BytesIO(f.read()))  # Using BytesIO
                file_names.append(file_name)
                file_text = ' '.join([paragraph.text for paragraph in doc.paragraphs])
                file_contents.append(file_text)
        except Exception as e:
            print(f"Error processing file {file_name}: {e}")



"""
If not using Google colab, use the code below to import files directly from your hard drive
"""
#set the path to the folder containing the doc files
folder_path = "/Users/dennistay/desktop/Seminar_samples/" #a mac path
folder_path = "C:/Users/dztay/Desktop/Seminar_samples"    #a windows path

file_names = []
file_contents = []
# Loop through each file in the folder and read its contents
for file_name in os.listdir(folder_path):
    if file_name.endswith('.docx'):
        file_path = os.path.join(folder_path, file_name)
        doc = Document(file_path)
        file_names.append(file_name)
        file_text = ' '.join([paragraph.text for paragraph in doc.paragraphs])
        file_contents.append(file_text)





# Imported here so this part does not depend on earlier sections having been run
from sklearn.decomposition import PCA as sklearnPCA
from sklearn.feature_extraction.text import TfidfVectorizer

# Create a DataFrame from the file names and contents
corpus = pd.DataFrame({'file_name': file_names, 'text': file_contents})

# Display the DataFrame
print(corpus)


#create a new column 'source' to label where each transcript is from, based on the filename.
#If the filename begins with 'PR' it is labelled 'CMFA', otherwise 'WHO'.
#NOTE(review): the original comment called the first source 'CPC' while the code writes 'CMFA' - confirm the intended label
corpus['source'] = corpus['file_name'].str.startswith('PR').map({True: 'CMFA', False: 'WHO'})


#use texthero to clean the text. If texthero is not available, this step is skipped
if 'hero' in globals():
    corpus['text'] = hero.clean(corpus['text'])
else:
    print("texthero was not imported (no `hero`); continuing with the uncleaned text")


#apply tfidf vectorizer to transcripts
vectorizer = TfidfVectorizer(ngram_range=(1,1))
matrix = vectorizer.fit_transform(corpus['text']).toarray()


#reduce matrix to 2D with PCA
#(random_state makes the result repeatable in case scikit-learn picks a randomized solver)
pca = sklearnPCA(n_components=2, random_state=0)

corpus[['Dim1','Dim2']] = pca.fit_transform(matrix)

#visualize the difference between the two sources
#data is passed by keyword so the call also works on seaborn versions
#that do not accept it as the first positional argument
sns.scatterplot(data=corpus, x='Dim1', y='Dim2', hue='source')