<a href="https://colab.research.google.com/github/dennistay1981/Resources/blob/main/Quant_schemes.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing libraries and data

In [1]:
import pandas as pd
import seaborn as sns
import nltk
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('display.max_rows',None)
pd.set_option('display.max_columns',None)
pd.set_option('display.width', 1000)

from pylab import rcParams
rcParams['figure.figsize']=12,6
rcParams['figure.dpi']=300

Importing text data

In [5]:
data=pd.read_csv('Tinder_tfidf(2D).csv')
data=pd.read_csv('Tinder.csv')

#cleaning
import re
from nltk.stem import *
p_stemmer = PorterStemmer()

# Remove punctuation, special characters
data['special_removed']=data['Post'].map(lambda x: re.sub(r'\W', ' ', x))
# Remove all single characters (e.g. s left behind after deleting aposthrophe)
data['singlechar_removed']=data['special_removed'].map(lambda x: re.sub(r'\s+[a-zA-Z]\s+', ' ', x))
# Substitute multiple spaces with single space (after removing single characters, double spaces are created)
data['singlechar_removed2']=data['singlechar_removed'].map(lambda x: re.sub(r'\s+', ' ', x, flags=re.I))
# Remove prefixed 'b' (if text string is in bytes format, a character b is appended with the string. This removes it)
data['b_removed']=data['singlechar_removed2'].map(lambda x: re.sub(r'^b\s+', ' ', x, flags=re.I))
# Convert the titles to lowercase
data['lower_case'] = data['b_removed'].map(lambda x: x.lower())
# Remove numbers (but not numbers within words)
data['num_removed'] = data['lower_case'].map(lambda x: re.sub("^\d+\s|\s\d+\s|\s\d+$", " ", x))
# Stemming to remove morphological affixes from words, leaving only the word stem
data['stemmed'] = data['num_removed'].map(lambda x: p_stemmer.stem(x))
# Finally, create final cleaned column as 'processed'
data['processed']=data['stemmed']

TF-IDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

#apply tfidf vectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,1))  #process up to n-grams (contiguous sequence of n words)
vectorizer.fit_transform(data['processed'])

#see the list of words/features
vectorizer.get_feature_names_out()

#get document-term matrix. This is a 'dense matrix' because every element (including the many 0) is stored
matrix=(vectorizer.fit_transform(data['processed']).toarray())

#x documents, y unique words/features
matrix.shape

#convert matrix to dataframe, with each feature and its corresponding tfidf score
df=pd.DataFrame(matrix, columns=vectorizer.get_feature_names_out())

#convert to csv if needed
df.to_csv('df.csv')

In [None]:
result_df = pd.concat([data, df], axis=1)


result_df.to_csv('IMDB tfidf.csv')

Visualizing outcomes by reducing to 2D

In [None]:
#PCA: reduce matrix to 2D if needed
from sklearn.decomposition import PCA as sklearnPCA
pca = sklearnPCA(n_components=2)
pca.fit_transform(matrix)

#view the linear combinations
pca.components_

#attach reduced 2D back to dataframe for future use
data[['Dim1','Dim2']]=pca.fit_transform(matrix)

sns.scatterplot(data,x='Dim1',y='Dim2', hue='Gender')


WORD EMBEDDING with large pre-trained models

In [17]:
#Install and import GENSIM
!pip install --upgrade gensim
import gensim.downloader as api

#See list of available pre-trained models. Larger ones take longer to download.
print(api.info()['models'].keys())

# Load Google News model (300 dimensions)
model = api.load("word2vec-google-news-300")

# Load glove-wiki-gigaword-50 (50 dimensions) (https://nlp.stanford.edu/projects/glove/)
model = api.load("glove-wiki-gigaword-50")



"""
Demonstrating word embedding features
"""
#displaying the vector for a certain word
model['dog']
model['not_a_word']

#vector algebra
#finding most similar words by specifying relations
model.most_similar(positive=['woman', 'king'], negative=['male'])
model.doesnt_match("breakfast cereal dinner lunch".split())
#calculating similarity index between word pairs
model.similarity('woman', 'man')
model.similarity('woman', 'literature')
model.similarity('man', 'literature')
model.similarity('woman', 'engineer')
model.similarity('man', 'engineer')



"""
Derive word embeddings for our data
"""
nltk.download('punkt')
nltk.download('punkt_tab')

text_column = data['processed']
# Convert the text to a list of sentences
text_data = []
for text in text_column:
    sentence_list = nltk.sent_tokenize(text)
    text_data.extend(sentence_list)
# Preprocess the text data
preprocessed_data = []
for sentence in text_data:
    preprocessed_sentence = [word.lower() for word in sentence.split() if word.isalpha()]
    preprocessed_data.append(preprocessed_sentence)


# Derive embeddings
embedding_data = []
for sentence in preprocessed_data:
    sentence_embedding = [model.get_vector(word) for word in sentence if word in model.key_to_index]
    if sentence_embedding:
        embedding_data.append(sum(sentence_embedding) / len(sentence_embedding))
    else:
        embedding_data.append(None)

#shape of embedding data (no. of sentences x 50 or 300 dimensions)
np.array(embedding_data).shape


"""
Convert embeddings to a dataframe.
Each row of the DataFrame corresponds to a sentence in the preprocessed data, and each column corresponds to a dimension of the Word2Vec embeddings.
"""
#Automatically name columns in sequence
embedding = pd.DataFrame(embedding_data, columns=['Dim{}'.format(i) for i in range(1, np.array(embedding_data).shape[1]+ 1)])


#reduce embedding to 2D with PCA, if needed
from sklearn.decomposition import PCA as sklearnPCA
pca = sklearnPCA(n_components=2)
pca.fit_transform(embedding)


#attach reduced 2D back to dataframe, for future use
data3=pd.read_csv('Lecture9.csv')
data3[['Dim1','Dim2']]=pca.fit_transform(embedding)



(100, 300)

Ensemble model fitting

In [None]:
x=data[['Dim1','Dim2']]
y=data['Gender']


# Perform train-test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state=42, stratify=y)


# We will use these three classifiers
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Instantiate them.
# Classifiers have optimal parameters that should also be independently determined, to optimize the ensemble.
# But we are skipping this step.
knn = KNeighborsClassifier()
lr = LogisticRegression()
svc = SVC()

# Decision trees and Naive bayes are another two common classifiers. We leave them out for now
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()


# Define our list of three classifiers.
classifiers = [('K Nearest Neighbours',knn), ('Logistic Regression',lr), ('SVC',svc)]

# Iterate over the pre-defined list of classifiers, and evaluate predictions
for clf_name, clf in classifiers:
    clf.fit(x_train, y_train)
    print(clf_name,'Train accuracy:', clf.score(x_train,y_train), 'Test accuracy:', clf.score(x_test, y_test))




# Use a VOTING CLASSIFIER to determine final result
from sklearn.ensemble import VotingClassifier
# Instantiate voting classifier
vc = VotingClassifier(estimators=classifiers)
vc.fit(x_train, y_train)

print('Voting Classifier train accuracy:', vc.score(x_train,y_train), 'Test accuracy:', vc.score(x_test,y_test))






# Confusion matrix
cnf_matrix = metrics.confusion_matrix(y, vc.predict(x))

sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="Blues", yticklabels=labels, xticklabels=labels, annot_kws={"size": 25})
plt.ylabel('Actual')
plt.xlabel('Predicted')

# Classification report
print(metrics.classification_report(y, vc.predict(x)))

Voting Classifier train accuracy: 0.5625 Test accuracy: 0.55
