# **Question 1b:​ Classification Task with a modified vectorizer**

In [None]:
import pandas as pd
import numpy as np

In [None]:
from google.colab import drive
# Mount Google Drive; the dataset CSVs used below are read from under this path.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


Loading train data

In [None]:
# Labelled training set, read from the mounted Drive folder.
train_csv_path = '/content/drive/MyDrive/datasets-data analysis/data/q1/train.csv'
df = pd.read_csv(train_csv_path)
df

Unnamed: 0,Id,Title,Content,Label
0,227464,"Netflix is coming to cable boxes, and Amazon i...",if you subscribe to one of three rinky-dink (...,Entertainment
1,244074,"Pharrell, Iranian President React to Tehran 'H...","pharrell, iranian president react to tehran '...",Entertainment
2,60707,Wildlife service seeks comments,the u.s. fish and wildlife service has reopen...,Technology
3,27883,Facebook teams up with Storyful to launch 'FB ...,the very nature of social media means it is o...,Technology
4,169596,Caesars plans US$880 mln New York casino,caesars plans us$880 mln new york casino jul ...,Business
...,...,...,...,...
111790,31462,Microsoft requires Office 2013 licensing for s...,in contrast to the muckle of special licenses...,Technology
111791,100821,Smallpox vials missing since 1950s found in la...,government workers at a research center near ...,Health
111792,86181,Scientists May Have Just Discovered the Key to...,harvard scientists may have just unlocked the...,Health
111793,256423,Justin Bieber to plead guilty to DUI,"justin bieber to plead guilty to duifri, 13 ju...",Entertainment


Loading test data

In [None]:
# Test set without labels, read from the same Drive folder as the training data.
test_csv_path = '/content/drive/MyDrive/datasets-data analysis/data/q1/test_without_labels.csv'
df_test = pd.read_csv(test_csv_path)
df_test

Unnamed: 0,Id,Title,Content
0,262120,Tracy Morgan upgraded to fair condition after ...,actor and comedian tracy morgan has been upgr...
1,175132,Smartphones Weigh on Samsung Electronics as Gu...,samsung electronics co ltd on tuesday issued u...
2,218739,FBI denies fumbling testimony on 'X-Men' direc...,michael f. egan iii said in a press conferenc...
3,253483,Bachelorette 2014 Spoilers: Week 3 Recap ??? E...,i am having mixed emotions for what is about ...
4,224109,Barack Obama honours Frankie Knuckles in lette...,u.s. president barack obama has paid a specia...
...,...,...,...
47907,50348,"BMW, Tesla meet to discuss standardizing elect...","june 16, 2014 by edward taylor reutersan emplo..."
47908,255044,Harrison Ford has been filming the seventh Sta...,he may have helped save the galaxy from the ev...
47909,66502,"It's Games, Games, Games As Microsoft Plans To...",less than three months after microsoft had a ...
47910,10319,App Detail » Microsoft Excel for iPad,app description *** excel is ready for ipad p...


In [None]:
# Distinct class labels present in the training set.
df['Label'].unique()

array(['Entertainment', 'Technology', 'Business', 'Health'], dtype=object)

Checking for duplicate rows

In [None]:
# Flag every row that repeats an earlier row; a lone False means there are none.
is_duplicate_row = df.duplicated()
is_duplicate_row.unique()

array([False])

**STOPWORDS**

In [None]:
from wordcloud import STOPWORDS

In [None]:
# wordcloud's stop-word list, extended with a few extra words chosen for this corpus.
extra_stopwords = {"said", "say", "may", "says", "one", "even", "now", "well", "will"}
stopwords = set(STOPWORDS) | extra_stopwords

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# The vectorizer lowercases, strips accents and tokenises text before it looks
# tokens up in the stop-word list, so a stop word containing punctuation can
# never match as written (scikit-learn warns that the list is "inconsistent").
# Push the stop words through the same preprocessing + tokenisation first.
# The result is passed as a list because recent scikit-learn releases reject
# a set for `stop_words`.
stop_word_analyzer = TfidfVectorizer(strip_accents='unicode').build_analyzer()
normalized_stopwords = sorted({token for word in stopwords for token in stop_word_analyzer(word)})

tfidf_vectorizer = TfidfVectorizer(max_features=20000, ngram_range=(1,2), min_df=2, analyzer='word', strip_accents='unicode', decode_error='replace', stop_words=normalized_stopwords)

# Learn the vocabulary and IDF weights from title + content. The explicit
# space keeps the last title word from fusing with the first content word, and
# fillna('') stops one missing field from turning the whole document into NaN.
fit_corpus = df['Title'].fillna('') + ' ' + df['Content'].fillna('')
tfidf_vectorizer.fit(fit_corpus)

# get_feature_names() was removed in scikit-learn 1.2; only fall back to it on
# releases that predate get_feature_names_out().
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    feature_names = tfidf_vectorizer.get_feature_names_out()
else:
    feature_names = tfidf_vectorizer.get_feature_names()
print("some sample features(unique words in the corpus)", [str(name) for name in feature_names[0:10]])
print('='*50)

# NOTE(review): only the Content column is vectorised here although the
# vocabulary was fitted on title + content — confirm that this is intended.
tfidf_vect_content = tfidf_vectorizer.transform(df['Content'].fillna(''))

  'stop_words.' % sorted(inconsistent))


some sample features(unique words in the corpus) ['00', '00 00', '00 pm', '000', '000 000', '000 barrels', '000 bitcoins', '000 cars', '000 copies', '000 employees']


In [None]:
# Vectorise the test documents with the vectorizer fitted on the training data.
test_content = df_test['Content']
tfidf_vect_content_test = tfidf_vectorizer.transform(test_content)

In [None]:
# (documents, features) of the training TF-IDF matrix.
n_train_docs, n_train_features = tfidf_vect_content.shape
print((n_train_docs, n_train_features))

(111795, 10000)


In [None]:
# (documents, features) of the test TF-IDF matrix.
n_test_docs, n_test_features = tfidf_vect_content_test.shape
print((n_test_docs, n_test_features))

(47912, 10000)


# Truncated SVD

In [None]:
from sklearn.decomposition import TruncatedSVD

# Truncated SVD projects the sparse TF-IDF matrix onto 500 components.
# The default solver is randomized, so the seed is pinned: without it every
# kernel restart produces different components and different downstream scores.
tfidf_svd_model = TruncatedSVD(n_components=500, random_state=42)

tfidf_svd_model.fit(tfidf_vect_content)

TruncatedSVD(algorithm='randomized', n_components=500, n_iter=5,
             random_state=None, tol=0.0)

In [None]:
# Reduced-dimension representation of the training documents.
X_tfidf = tfidf_svd_model.transform(tfidf_vect_content)

# components_ holds one row per retained component.
tfidf_svd_model.components_.shape[0]

500

# Classification

In [None]:
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier

from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, recall_score, roc_auc_score, precision_score

from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


# Number of cross-validation folds and the target labels used by the experiments below.
K = 5
y = df['Label']

**KNN (BoW with TF-IDF weights)**

In [None]:
# K-fold cross-validation of a 5-nearest-neighbour classifier on the sparse
# TF-IDF features. n_splits is tied to K so the number of folds and the
# averaging below cannot drift apart.
kf = KFold(n_splits=K)
X = tfidf_vect_content
print(kf)

# One list of per-fold scores for each reported metric.
fold_accuracy, fold_f1, fold_precision, fold_recall = [], [], [], []

for train_index, test_index in kf.split(X):

  # KFold yields positional indices, so the label Series is indexed with
  # .iloc; plain y[...] is label-based and only happens to work while df
  # keeps its default RangeIndex.
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  model = KNeighborsClassifier(n_neighbors=5)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  fold_accuracy.append(accuracy_score(y_test, y_pred))
  fold_f1.append(f1_score(y_test, y_pred, average='macro'))
  fold_precision.append(precision_score(y_test, y_pred, average='macro'))
  fold_recall.append(recall_score(y_test, y_pred, average='macro'))

# Average each metric over the folds that were actually run.
accuracy = sum(fold_accuracy) / len(fold_accuracy)
F1_score = sum(fold_f1) / len(fold_f1)
precision = sum(fold_precision) / len(fold_precision)
recall = sum(fold_recall) / len(fold_recall)

print(accuracy, F1_score, precision, recall)

KFold(n_splits=5, random_state=None, shuffle=False)
0.9694530166823203 0.9662307795060622 0.9667014912470094 0.9657788703654884


**KNN (SVD)**

In [None]:
# K-fold cross-validation of a 5-nearest-neighbour classifier on the
# SVD-reduced features. n_splits is tied to K so the number of folds and the
# averaging below cannot drift apart.
kf = KFold(n_splits=K)
X = X_tfidf
print(kf)

# One list of per-fold scores for each reported metric.
fold_accuracy, fold_f1, fold_precision, fold_recall = [], [], [], []

for train_index, test_index in kf.split(X):

  # KFold yields positional indices, so the label Series is indexed with
  # .iloc; plain y[...] is label-based and only happens to work while df
  # keeps its default RangeIndex.
  X_train, X_test = X[train_index], X[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  model = KNeighborsClassifier(n_neighbors=5)
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)

  fold_accuracy.append(accuracy_score(y_test, y_pred))
  fold_f1.append(f1_score(y_test, y_pred, average='macro'))
  fold_precision.append(precision_score(y_test, y_pred, average='macro'))
  fold_recall.append(recall_score(y_test, y_pred, average='macro'))

# Average each metric over the folds that were actually run.
accuracy = sum(fold_accuracy) / len(fold_accuracy)
F1_score = sum(fold_f1) / len(fold_f1)
precision = sum(fold_precision) / len(fold_precision)
recall = sum(fold_recall) / len(fold_recall)

print(accuracy, F1_score, precision, recall)

KFold(n_splits=5, random_state=None, shuffle=False)
0.9631557761975044 0.9595273062118979 0.9621169405508695 0.9570819207040999


# **Final model and test-set predictions**

In [None]:
# Fit the final 5-nearest-neighbour classifier on the full training TF-IDF matrix.
final_model = KNeighborsClassifier(n_neighbors=5)
final_model.fit(X=tfidf_vect_content, y=y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [None]:
# Predict a category for every test document from its TF-IDF vector.
final_predictions = final_model.predict(X=tfidf_vect_content_test)

In [None]:
# Pair each test Id with its predicted category.
final_output = df_test[["Id"]].assign(Predicted=final_predictions)

final_output

Unnamed: 0,Id,Predicted
0,262120,Entertainment
1,175132,Business
2,218739,Entertainment
3,253483,Entertainment
4,224109,Entertainment
...,...,...
47907,50348,Technology
47908,255044,Entertainment
47909,66502,Technology
47910,10319,Business


In [None]:
# Write the Id/Predicted pairs as a comma-separated file, without the index column.
predictions_csv_path = '/content/drive/MyDrive/datasets-data analysis/data/q1/testSet_categories.csv'
final_output.to_csv(predictions_csv_path, index=False)