## Elle Nguyen - CIS 3715 - Section 02
### Lab 09 - Document Analysis

Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics.cluster import normalized_mutual_info_score

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

### Task 2.1 - Load data and represent it with TF-IDF representation

Load data

In [2]:
# Read the BBC news CSV into a DataFrame; the path is relative to the working directory.
df = pd.read_csv(filepath_or_buffer='BBC_News_Train.csv')

In [3]:
# Preview the first five rows (the default count, spelled out here).
df.head(n=5)

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business


In [4]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ArticleId  1490 non-null   int64 
 1   Text       1490 non-null   object
 2   Category   1490 non-null   object
dtypes: int64(1), object(2)
memory usage: 35.0+ KB


Split the data into training and testing subsets

In [5]:
# Hold out 15% of the rows for testing; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(
    df,
    test_size=0.15,
    random_state=42,
)

In [6]:
# Distinct category labels that occur in the training split.
train_categories = df_train["Category"].unique()
f'Train data target names: {train_categories}'

"Train data target names: ['entertainment' 'tech' 'business' 'politics' 'sport']"

In [7]:
# Row count of the training split.
n_train = df_train.shape[0]
f'#training samples: {n_train}'

'#training samples: 1266'

In [8]:
# Row count of the testing split.
n_test = df_test.shape[0]
f'#testing samples: {n_test}'

'#testing samples: 224'

Represent the data with TF-IDF vectors

In [9]:
# Fit the TF-IDF vectorizer on the training text only, then apply the same
# fitted vectorizer to the test text so no test information leaks into the fit.
vectorizer = TfidfVectorizer(stop_words='english')

train_text = df_train["Text"]
test_text = df_test["Text"]

df_train_vectors = vectorizer.fit_transform(train_text)
df_test_vectors = vectorizer.transform(test_text)

In [10]:
# (rows, columns) of each TF-IDF matrix.
train_shape, test_shape = df_train_vectors.shape, df_test_vectors.shape
f'{train_shape}, {test_shape}'

'(1266, 22864), (224, 22864)'

### Task 2.2 - Use KNN to do document classification

In [11]:
# Feature matrices and label vectors for the two splits.
Xtr, Ytr = df_train_vectors, df_train['Category']
Xte, Yte = df_test_vectors, df_test['Category']

# Candidate neighbourhood sizes: k = 1, 2, 3, 4.
k_range = range(1, 5)
param_grid = {'n_neighbors': k_range}

# Base estimator; n_neighbors is overridden by each grid candidate.
clf_knn = KNeighborsClassifier(n_neighbors=1)

# 5-fold cross-validation on the training split, scored by accuracy.
grid = GridSearchCV(
    estimator=clf_knn,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid.fit(Xtr, Ytr)

GridSearchCV(cv=5, estimator=KNeighborsClassifier(n_neighbors=1),
             param_grid={'n_neighbors': range(1, 5)}, scoring='accuracy')

In [12]:
# Mean cross-validated accuracy of the best grid candidate.
best_cv_accuracy = grid.best_score_
f'{best_cv_accuracy}'

'0.9296878404033488'

In [13]:
# Hyper-parameter setting selected by the grid search.
best_knn_params = grid.best_params_
f'{best_knn_params}'

"{'n_neighbors': 4}"

Testing

In [14]:
# Refit KNN on the full training split using the k chosen by the grid search,
# then predict labels for the held-out test split.
best_k = grid.best_params_['n_neighbors']
clf_knn = KNeighborsClassifier(n_neighbors=best_k).fit(Xtr, Ytr)

y_pred = clf_knn.predict(Xte)

Performance

In [15]:
# Test-set accuracy plus macro- and micro-averaged F1 for the current predictions.
acc = accuracy_score(Yte, y_pred)
macro_f1, micro_f1 = (
    f1_score(Yte, y_pred, average=averaging) for averaging in ('macro', 'micro')
)

f'{acc}, {macro_f1}, {micro_f1}'

'0.9464285714285714, 0.9442512742303932, 0.9464285714285714'

### Task 2.3 - Use Logistic Regression to do document classification

Training with cross-validation

In [16]:
# Candidate inverse-regularisation strengths: C = 1, 2, ..., 9.
coeff = range(1, 10)
param_grid = {'C': coeff}

# L2-penalised logistic regression; C is overridden by each grid candidate.
clf_lr = LogisticRegression(penalty='l2')

# 5-fold cross-validation on the training split, scored by accuracy.
grid = GridSearchCV(
    estimator=clf_lr,
    param_grid=param_grid,
    cv=5,
    scoring='accuracy',
)
grid.fit(Xtr, Ytr)

GridSearchCV(cv=5, estimator=LogisticRegression(),
             param_grid={'C': range(1, 10)}, scoring='accuracy')

In [17]:
# Hyper-parameter setting selected by the grid search.
best_lr_params = grid.best_params_
f'{best_lr_params}'

"{'C': 5}"

Testing

In [18]:
# Refit logistic regression on the full training split using the selected C,
# then predict labels for the held-out test split.
best_C = grid.best_params_['C']
clf_lr = LogisticRegression(penalty='l2', C=best_C).fit(Xtr, Ytr)

y_pred = clf_lr.predict(Xte)

Performance

In [19]:
# Test-set accuracy plus macro- and micro-averaged F1 for the current predictions.
acc = accuracy_score(Yte, y_pred)
macro_f1, micro_f1 = (
    f1_score(Yte, y_pred, average=averaging) for averaging in ('macro', 'micro')
)

f'{acc}, {macro_f1}, {micro_f1}'

'0.9732142857142857, 0.9737690086489567, 0.9732142857142857'

### Task 2.4 - Use K-means to do document clustering and find the 10 most representative words in each cluster

Use K-means to partition this dataset into 5 clusters

In [20]:
# Partition the training TF-IDF vectors into 5 clusters.
# n_init is pinned to 10 (the historical default) because newer scikit-learn
# releases default to n_init='auto', which runs a single k-means++
# initialisation and can therefore produce different clusters.
# random_state fixes the centroid initialisation so the run is repeatable.
cluster = KMeans(n_clusters=5, n_init=10, random_state=42).fit(Xtr)

Get the clustering centroids

In [21]:
# Cluster centres learned by the fitted K-means model (one row per cluster).
centroids = cluster.cluster_centers_

Get the top 10 most representative words in each cluster

In [22]:
# For every centroid, order vocabulary indices from highest to lowest weight.
order_centroids = centroids.argsort()[:, ::-1]

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is the supported way to map column index -> term.
terms = vectorizer.get_feature_names_out()

# Iterate over the centroids that actually exist instead of a hard-coded 5,
# so this cell stays correct if the number of clusters is changed.
for i, ranked_indices in enumerate(order_centroids):
    print("Cluster %d:" % i)
    # The 10 highest-weighted terms of this centroid.
    for ind in ranked_indices[:10]:
        print(' %s' % terms[ind])
    print('')

Cluster 0:
 mr
 labour
 election
 blair
 said
 party
 government
 brown
 minister
 howard

Cluster 1:
 england
 game
 said
 win
 cup
 season
 chelsea
 match
 world
 injury

Cluster 2:
 said
 people
 music
 mobile
 mr
 new
 uk
 software
 tv
 phone

Cluster 3:
 said
 growth
 economy
 bank
 year
 oil
 market
 firm
 mr
 economic

Cluster 4:
 film
 best
 awards
 actor
 award
 festival
 films
 oscar
 director
 actress

