<a href="https://colab.research.google.com/github/dennistay1981/Resources/blob/main/Lecture10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Importing libraries and data

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Read the lecture dataset (a CSV file located via a relative path) into a DataFrame
data = pd.read_csv(filepath_or_buffer='Lecture10.csv')

Display all columns and rows, adjust image size

In [None]:
from pylab import rcParams

# Default figure size (width, height) and resolution for every plot below
rcParams.update({'figure.figsize': (8, 4), 'figure.dpi': 100})

# Print DataFrames without truncating rows or columns, on a wide output line
for option, value in {'display.max_rows': None,
                      'display.max_columns': None,
                      'display.width': 1000}.items():
    pd.set_option(option, value)


Explore the dataset with and without source information


*   scatterplot
*   most frequent words



In [None]:
# Each scatterplot gets its own figure. When both calls run in the same cell
# without opening a new figure, they draw onto the same axes and the second
# title replaces the first.
fig, ax = plt.subplots()
sns.scatterplot(data=data, x='Dim1', y='Dim2', ax=ax)
ax.set_title('Scatterplot of headlines (without source info)')
plt.show()

# Same points, coloured by the 'Source' column
fig, ax = plt.subplots()
sns.scatterplot(data=data, x='Dim1', y='Dim2', hue='Source', ax=ax)
ax.set_title('Scatterplot of headlines by source')
plt.show()



#USING TEXTHERO: N most frequent words, overall and per source
import texthero as hero

N = 10


def plot_top_words(headlines, title, n=10):
    """Bar-plot the first n entries of hero.top_words for a Series of text.

    Parameters
    ----------
    headlines : pandas.Series
        Text to analyse; passed unchanged to hero.top_words.
    title : str
        Title of the bar plot.
    n : int, default 10
        Number of leading entries of the hero.top_words result to keep.

    Returns
    -------
    pandas.DataFrame
        The plotted table: words as index, values in a column named 'Headline'.
    """
    # Name the value column explicitly so that y='Headline' below does not
    # depend on how hero.top_words happens to name the Series it returns.
    top = hero.top_words(headlines)[:n].rename('Headline').to_frame()

    # One figure per call: otherwise successive bar plots issued from the same
    # cell are stacked onto a single axes.
    fig, ax = plt.subplots()
    sns.barplot(data=top, x=top.index, y='Headline', ax=ax)
    ax.set(title=title, xlabel='Word', ylabel='Frequency')
    ax.tick_params(axis='x', labelrotation=60)
    plt.show()
    return top


# All headlines together
topwords = plot_top_words(data['Headline'], f'Top {N} words overall', n=N)

# One plot per news source
scmp_top = plot_top_words(data.loc[data.Source == 'SCMP', 'Headline'], f'Top {N} words (SCMP)', n=N)
CD_top = plot_top_words(data.loc[data.Source == 'CD', 'Headline'], f'Top {N} words (CD)', n=N)
HKFP_top = plot_top_words(data.loc[data.Source == 'HKFP', 'Headline'], f'Top {N} words (HKFP)', n=N)




#USING SCIKIT-LEARN: top words by tfidf-scores.
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigram TF-IDF matrix of the headlines, converted to a dense array
vectorizer = TfidfVectorizer(ngram_range=(1,1))
array = vectorizer.fit_transform(data['Headline']).toarray()
# Sum the TF-IDF values for each feature across all documents
total_tfidf_scores = array.sum(axis=0)
# Get the feature names (same order as the columns of the array)
feature_names = vectorizer.get_feature_names_out()
# Create a dictionary of feature names and their total TF-IDF scores
features_dict = dict(zip(feature_names, total_tfidf_scores))
# Sort the features by their total TF-IDF score in descending order
sorted_features = sorted(features_dict.items(), key=lambda item: item[1], reverse=True)
# Print the 10 features with the highest total TF-IDF score
# (a summed weight, not a raw frequency count)
for feature, score in sorted_features[:10]:
    print(f"{feature}: {score}")


#plot top words by scores
topwords = pd.DataFrame(sorted_features[:10], columns=['word','score'])
# Open a dedicated figure so the bars are not drawn over an earlier plot that
# is still current in this cell, and give the chart a title of its own.
fig, ax = plt.subplots()
sns.barplot(data=topwords, x='word', y='score', ax=ax)
ax.set(title='Top 10 words by total TF-IDF score', xlabel='word', ylabel='score')
plt.show()


Classification

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

# Outcome is the 'Source' column; predictors are every column from position 5
# onwards (D1 to D300)
y = data['Source']
x = data.iloc[:, 5:]

# Hold out 20% of the rows as a test set, stratified on y, with a fixed seed
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

# Candidate values of k: 1 to 15
neighbors = np.arange(1, 16)

# For each k, fit on the training set and record (train accuracy, test accuracy)
accuracy_pairs = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(x_train, y_train)
    accuracy_pairs.append((knn.score(x_train, y_train), knn.score(x_test, y_test)))

# Unpack into one array per curve; the average is the midpoint of the two
train_accuracy, test_accuracy = np.array(accuracy_pairs).T
average_accuracy = (train_accuracy + test_accuracy) / 2

# Plot the three accuracy curves against k
fig, ax = plt.subplots()
ax.set_title('k-NN: Varying Number of Neighbors')
for curve, curve_label in ((train_accuracy, 'train accuracy'),
                           (test_accuracy, 'test accuracy'),
                           (average_accuracy, 'avg accuracy')):
    ax.plot(neighbors, curve, label=curve_label)
ax.set_xticks(neighbors)
ax.set_xlabel('Number of Neighbors')
ax.set_ylabel('Accuracy')
ax.legend()
plt.show()


# Fit the classifier
# NOTE: the model is fitted on the full dataset and then scored on that same
# data, so the accuracy and confusion matrix below describe the training data,
# not held-out data.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(x, y)

# Predict once and reuse the result below
y_pred = knn.predict(x)

# Evaluate accuracy (printed explicitly: a bare expression in the middle of a
# cell produces no output)
print('Accuracy:', knn.score(x, y))

# Confusion matrix
from sklearn import metrics

# Take the class order from the fitted model and pass it to confusion_matrix
# explicitly. Series.unique() returns labels in order of first appearance,
# which need not match the order confusion_matrix uses for its rows/columns,
# so using it for the tick labels can attach the wrong name to a row/column.
labels = knn.classes_
cnf_matrix = metrics.confusion_matrix(y, y_pred, labels=labels)
print(cnf_matrix)

# fmt="d" shows whole counts instead of scientific notation for large cells
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="Blues", yticklabels=labels, xticklabels=labels, annot_kws={"size": 25}, fmt="d", ax=ax)
ax.set(ylabel='Actual', xlabel='Predicted')
plt.show()


Classification report

In [None]:
# Text report of per-class metrics: actual labels first, predicted labels second
predicted = knn.predict(x)
print(metrics.classification_report(y, predicted))

Ensemble learning

In [None]:
# The ensemble is built from these three classifier types
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Decision trees and Naive Bayes are two other common classifiers.
# They are instantiated here but left out of the ensemble for now.
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB

# Combines the votes of several classifiers into one prediction
from sklearn.ensemble import VotingClassifier

# All classifiers use their default parameters. Each has optimal parameters
# that should be determined independently to optimize the ensemble,
# but that step is skipped here.
knn, lr, svc = KNeighborsClassifier(), LogisticRegression(), SVC()
dt = DecisionTreeClassifier()
nb = MultinomialNB()

# (display name, estimator) pairs for the three ensemble members
classifiers = [
    ('K Nearest Neighbours', knn),
    ('Logistic Regression', lr),
    ('SVC', svc),
]

# Fit each member on the full data and report its accuracy on that same data
for clf_name, clf in classifiers:
    member_accuracy = clf.fit(x, y).score(x, y)
    print(clf_name, ':', member_accuracy)

# Fit the voting classifier over the same members and report its accuracy
vc = VotingClassifier(estimators=classifiers)
vc.fit(x, y)
voting_accuracy = vc.score(x, y)

print('Voting Classifier accuracy:', voting_accuracy)



# Confusion matrix
# Predict once and reuse the result for both the matrix and the report
y_pred = vc.predict(x)

# Use the fitted model's class order for the matrix AND the tick labels, so
# each row/column is annotated with the class it actually holds (labels taken
# in order of first appearance need not match the matrix ordering).
labels = vc.classes_
cnf_matrix = metrics.confusion_matrix(y, y_pred, labels=labels)

# fmt="d" shows whole counts instead of scientific notation for large cells
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="Blues", yticklabels=labels, xticklabels=labels, annot_kws={"size": 25}, fmt="d", ax=ax)
ax.set(ylabel='Actual', xlabel='Predicted')
plt.show()

# Classification report
print(metrics.classification_report(y, y_pred))




SEMINAR 10

In [None]:
data = pd.read_csv('Seminar10.csv')
# examine the first 5 rows (printed explicitly: a bare expression in the
# middle of a cell produces no output)
print(data.head())

# 50000 rows may be too much to process on older computers with lower memory
# we therefore randomly sample just 10% of the data, and make sure we STRATIFY this sample by sentiment category
from sklearn.model_selection import train_test_split

# train_test_split is used here purely as a stratified sampler: the 10% "test"
# part becomes the working dataset and the remaining 90% is set aside.
# Remember to standardize random_state so everyone will have the same sample
data_90, data = train_test_split(data, test_size=0.1, stratify=data['sentiment'], random_state=42)

# reset the index for better organization
data = data.reset_index(drop=True)

# check the new data size and split of sentiments
print('Rows in sample:', len(data))
print(data.groupby('sentiment').size())




# ---------------------------------------------------------------------------
# TEXT CLEANING
# Each step writes its result to a new column so intermediate stages can be
# inspected; the final text ends up in data['processed'].
# ---------------------------------------------------------------------------
import re
from nltk.stem import PorterStemmer
p_stemmer = PorterStemmer()

# Remove punctuation, special characters (every non-word character becomes a space)
data['special_removed'] = data['review'].map(lambda text: re.sub(r'\W', ' ', text))
# Remove all single characters (e.g. the "s" left behind after deleting an apostrophe)
data['singlechar_removed'] = data['special_removed'].map(lambda text: re.sub(r'\s+[a-zA-Z]\s+', ' ', text))
# Substitute multiple spaces with single space (after removing single characters, double spaces are created)
data['singlechar_removed2'] = data['singlechar_removed'].map(lambda text: re.sub(r'\s+', ' ', text))
# Remove prefixed 'b' (if the text was a bytes literal, a leading "b" is left at the start of the string)
data['b_removed'] = data['singlechar_removed2'].map(lambda text: re.sub(r'^b\s+', ' ', text, flags=re.I))
# Convert the reviews to lowercase
data['lower_case'] = data['b_removed'].map(lambda text: text.lower())
# Remove numbers (but not numbers within words).
# A raw string avoids invalid-escape warnings for \d and \b, and \b\d+\b also
# catches runs of adjacent numbers ("1 2 3"), which a pattern that consumes
# the surrounding whitespace skips every second time.
data['num_removed'] = data['lower_case'].map(lambda text: re.sub(r'\b\d+\b', ' ', text))
# Stemming to remove morphological affixes from words, leaving only the word stem.
# The stemmer works on ONE word at a time: handing it the whole review would
# treat the entire string as a single word, so stem each token and re-join.
data['stemmed'] = data['num_removed'].map(
    lambda text: ' '.join(p_stemmer.stem(word) for word in text.split())
)
# Finally, create final cleaned column as 'processed'
data['processed'] = data['stemmed']


# ---------------------------------------------------------------------------
# TF-IDF VECTORIZATION
# ---------------------------------------------------------------------------
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1,1) keeps single words only; widen it to include longer n-grams
# (contiguous sequences of n words)
vectorizer = TfidfVectorizer(ngram_range=(1,1))

# Fit the vectorizer ONCE and convert the document-term matrix (sparse matrix)
# to a dense array. Fitting twice and discarding the first result only
# doubles the work.
array = vectorizer.fit_transform(data['processed']).toarray()


#reduce array to 2D for visualization
from sklearn.decomposition import PCA as sklearnPCA
# random_state fixes the result in case PCA picks a randomized solver for an
# input of this size, so everyone gets the same coordinates
pca = sklearnPCA(n_components=2, random_state=42)
data[['Dim1','Dim2']] = pca.fit_transform(array)  #attach 2D back to dataset

#plot scatterplot on its own figure and close it with plt.show(), so that
#later plots issued from the same cell are not drawn on top of it
fig, ax = plt.subplots()
sns.scatterplot(data=data, x='Dim1', y='Dim2', hue='sentiment', ax=ax)
ax.set_title('Reviews in 2D (PCA of TF-IDF) by sentiment')
plt.show()


# Outcome label and predictors. The predictors are the uncompressed tfidf array.
# You can try x = data[['Dim1','Dim2']] instead if you want
# (NOTE(review): the 2D PCA scores can be negative, which MultinomialNB is
# expected to reject - confirm before swapping).
x, y = array, data['sentiment']


# ---------------------------------------------------------------------------
# ENSEMBLE LEARNING
# ---------------------------------------------------------------------------
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import VotingClassifier


# The three ensemble members, all with default parameters
knn, lr, nb = KNeighborsClassifier(), LogisticRegression(), MultinomialNB()

# (display name, estimator) pairs
classifiers = [
    ('K Nearest Neighbours', knn),
    ('Logistic Regression', lr),
    ('Naive Bayes', nb),
]

# Fit each member on the full data and report its accuracy on that same data
for clf_name, clf in classifiers:
    member_accuracy = clf.fit(x, y).score(x, y)
    print(clf_name, ':', member_accuracy)


# Fit the voting classifier over the same members and report its accuracy
vc = VotingClassifier(estimators=classifiers)
vc.fit(x, y)
voting_accuracy = vc.score(x, y)

print('Voting Classifier accuracy:', voting_accuracy)



# Confusion matrix
# Predict once and reuse the result for both the matrix and the report
y_pred = vc.predict(x)

# Use the fitted model's class order for the matrix and for the tick labels
labels = vc.classes_
cnf_matrix = metrics.confusion_matrix(data['sentiment'], y_pred, labels=labels)
print(cnf_matrix)

# Heatmap on its own figure (otherwise it is drawn onto whichever axes is
# still current from an earlier plot in this cell).
# Setting fmt=".0f" will display the full number instead of scientific notation
fig, ax = plt.subplots()
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="Blues", yticklabels=labels, xticklabels=labels, annot_kws={"size": 25}, fmt=".0f", ax=ax)
ax.set(ylabel='Actual', xlabel='Predicted')
plt.show()

# Classification Report
print(metrics.classification_report(y, y_pred))





# ---------------------------------------------------------------------------
# Try the fitted voting classifier on your own review and predict its sentiment!
# ---------------------------------------------------------------------------

# Write your own 'reviews' here
my_reviews = [
    "do not spend a single cent of your hard earned money on this pathetic excuse for a so called movie",
    "the most brilliant thing I have ever seen, everything was choreographed to perfection, what a fantastic treat",
]

# Vectorize the new text with the ALREADY FITTED vectorizer (transform, not
# fit_transform). Refitting on data that includes the new reviews can change
# the vocabulary and the IDF weights, so the columns would no longer line up
# with the features the voting classifier was trained on (and any unseen word
# changes the number of columns, which makes predict fail).
# The reviews are kept in their own list rather than appended to `data`, so the
# dataset is not left with rows that have no review/sentiment and re-running
# this code does not keep adding rows.
# NOTE: the text is vectorized as given; for best results pass it through the
# same cleaning steps that produced data['processed'].
new_array = vectorizer.transform(my_reviews).toarray()

# Predict the label of every review (not only the last one) and show them
predictions = vc.predict(new_array)
for review, label in zip(my_reviews, predictions):
    print(f"{label}: {review}")

# predicted label of the final review, shown as the cell output
predictions[-1]

