In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import model_selection, svm
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier


[nltk_data] Downloading package wordnet to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /usr/local/share/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /usr/local/share/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Reading and preprocessing 6k annotated data

In [118]:
df = pd.read_csv("~/data.txt", delimiter=',',encoding = "utf-8")
df


Unnamed: 0,text,label
0,My Cozy Glam Aaliyah inspired OOTD aaliyahinsp...,0
1,What a guy #chubbygirlsdoitbetter beyourself...,1
2,Beautiful disaster,0
3,Part Mom,0
4,Depressed piece of shit,0
5,math teacher calls irrational numbers female n...,0
6,How did you manage to blow through a diaper AN...,1
7,Thick thighs Thin patience,1
8,i once got told by my boss that i was hired be...,1
9,a man stood next to me leant close in over my ...,1


In [119]:
df['text'] = [entry.lower() for entry in df['text']]
df['text']= [word_tokenize(entry) for entry in df['text']]


In [120]:
Corpus  = df
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    Corpus.loc[index,'text'] = str(Final_words)
Corpus.head()

Unnamed: 0,text,label
0,"['cozy', 'glam', 'aaliyah', 'inspire', 'ootd',...",0
1,"['guy', 'chubbygirlsdoitbetter', 'beyourself',...",1
2,"['beautiful', 'disaster']",0
3,"['part', 'mom']",0
4,"['depressed', 'piece', 'shit']",0


In [121]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text'],Corpus['label'],test_size=0.2)

In [169]:
type(Corpus['text'])

pandas.core.series.Series

In [123]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)


# Running the best model on that data

In [124]:
kfold = model_selection.KFold(n_splits=5)
c = [0.1, 1, 10, 100, 1000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
hyperparameters = {'C': c,'gamma': gamma, 'kernel': ['rbf']}  
SVM = svm.SVC(probability=True)
clf = GridSearchCV(SVM, hyperparameters, cv=5, verbose=0)
best_model = clf.fit(Train_X_Tfidf,Train_Y)

results = model_selection.cross_val_score(SVM, Train_X_Tfidf, Train_Y, cv=kfold)
print("Acc:",results.mean()*100)

predictions_SVM = best_model.predict(Test_X_Tfidf)

print("Accuracy -> ",accuracy_score( Test_Y,predictions_SVM)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_SVM, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_SVM, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_SVM, average="macro")) 



Acc: 78.3034193604449
Accuracy ->  92.66666666666666
F1 Score ->  0.8835850512622643
Precision ->  0.9165648155221144
Recall ->  0.8590085419444033


In [125]:
best_model.probability = True
best_model.predict_proba(Test_X_Tfidf)


array([[0.98215345, 0.01784655],
       [0.48395589, 0.51604411],
       [0.97984215, 0.02015785],
       ...,
       [0.96936053, 0.03063947],
       [0.96934121, 0.03065879],
       [0.58464376, 0.41535624]])

### Reading and Preprocessing the 45k data

In [126]:
df2 = pd.read_csv("~/45k.txt", delimiter=',',engine='python',encoding = "utf-8")
df2['text'] = [entry.lower() for entry in df2['text']]
df2['text']= [word_tokenize(entry) for entry in df2['text']]

In [49]:
best_model.predict_proba(Test_X_Tfidf)

array([[0.94558194, 0.05441806],
       [0.96669913, 0.03330087],
       [0.03741548, 0.96258452],
       ...,
       [0.99083513, 0.00916487],
       [0.97331543, 0.02668457],
       [0.96750679, 0.03249321]])

In [127]:
Corpus2  = df2
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(Corpus2['text']):
    Final_words = []
    word_Lemmatized = WordNetLemmatizer()
    for word, tag in pos_tag(entry):
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    Corpus2.loc[index,'text'] = str(Final_words)
Corpus2.head()

Unnamed: 0,text
0,"['c', 'p', 'n', 'need', 'spend', 'day', 'art',..."
1,"['humble', 'please', 'stop', 'body', 'sham', '..."
2,"['hate', 'part', 'someone', 'tell', 'see', 'sm..."
3,['pretty']
4,"['check', 'instagram', 'story', 'know', 'love'..."


In [128]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus2['text'])
big_data = Tfidf_vect.transform(Corpus2['text'])
new_results_prob = best_model.predict_proba(big_data)
new_results_label = best_model.predict(big_data)

#### Adding the selected data with a high degree of certainty back to the corpus and training the model again

In [195]:
new_dict = {'text' : [] , 'label':[] }

for i in range(len(new_results_prob)):
    if(new_results_prob[i][1] >= 0.8):
        new_dict['text'].append(Corpus2.iloc[i]['text'])
        new_dict['label'].append(1)
    
data_2 = pd.DataFrame(new_dict)
Corpus.append(data_2)


In [198]:
Corpus = Corpus.reset_index(drop=True)
Corpus

Unnamed: 0,text,label
0,"['cozy', 'glam', 'aaliyah', 'inspire', 'ootd',...",0
1,"['guy', 'chubbygirlsdoitbetter', 'beyourself',...",1
2,"['beautiful', 'disaster']",0
3,"['part', 'mom']",0
4,"['depressed', 'piece', 'shit']",0
5,"['math', 'teacher', 'call', 'irrational', 'num...",0
6,"['manage', 'blow', 'diaper', 'clothes']",1
7,"['thick', 'thigh', 'thin', 'patience']",1
8,"['get', 'tell', 'bos', 'hire', 'tit', 'teeth']",1
9,"['man', 'stand', 'next', 'leant', 'close', 'sh...",1


In [199]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text'],Corpus['label'],test_size=0.2)

In [201]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [202]:
kfold = model_selection.KFold(n_splits=5)
c = [0.1, 1, 10, 100, 1000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
hyperparameters = {'C': c,'gamma': gamma, 'kernel': ['rbf']}  
SVM = svm.SVC(probability=True)
clf = GridSearchCV(SVM, hyperparameters, cv=5, verbose=0)
best_model = clf.fit(Train_X_Tfidf,Train_Y)

results = model_selection.cross_val_score(SVM, Train_X_Tfidf, Train_Y, cv=kfold)
print("Acc:",results.mean()*100)

predictions_SVM = best_model.predict(Test_X_Tfidf)

print("Accuracy -> ",accuracy_score( Test_Y,predictions_SVM)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_SVM, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_SVM, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_SVM, average="macro"))



Acc: 77.83558792924038
Accuracy ->  93.17803660565723
F1 Score ->  0.8894114901466226
Precision ->  0.9093470936123089
Recall ->  0.8727541120079645


## Second Iteration

In [203]:
new_results_prob = best_model.predict_proba(big_data)
new_results_label = best_model.predict(big_data)

In [204]:
new_dict = {'text' : [] , 'label':[] }

for i in range(len(new_results_prob)):
    if(new_results_prob[i][1] >= 0.8):
        new_dict['text'].append(Corpus2.iloc[i]['text'])
        new_dict['label'].append(1)
    
data_2 = pd.DataFrame(new_dict)


In [222]:
Train_X, Test_X, Train_Y, Test_Y = model_selection.train_test_split(Corpus['text'],Corpus['label'],test_size=0.2)


In [223]:
Tfidf_vect = TfidfVectorizer(max_features=5000)
Tfidf_vect.fit(Corpus['text'])
Train_X_Tfidf = Tfidf_vect.transform(Train_X)
Test_X_Tfidf = Tfidf_vect.transform(Test_X)

In [224]:
kfold = model_selection.KFold(n_splits=5)
c = [0.1, 1, 10, 100, 1000]
gamma = [1, 0.1, 0.01, 0.001, 0.0001]
hyperparameters = {'C': c,'gamma': gamma, 'kernel': ['rbf']}  
SVM = svm.SVC(probability=True)
clf = GridSearchCV(SVM, hyperparameters, cv=5, verbose=0)
best_model = clf.fit(Train_X_Tfidf,Train_Y)

results = model_selection.cross_val_score(SVM, Train_X_Tfidf, Train_Y, cv=kfold)
print("Acc:",results.mean()*100)

predictions_SVM = best_model.predict(Test_X_Tfidf)

print("Accuracy -> ",accuracy_score( Test_Y,predictions_SVM)*100)
print("F1 Score -> ",f1_score(Test_Y,predictions_SVM, average="macro"))
print("Precision -> ",precision_score(Test_Y, predictions_SVM, average="macro"))
print("Recall -> ",recall_score(Test_Y,predictions_SVM, average="macro")) 



Acc: 78.53239051094891
Accuracy ->  92.5
F1 Score ->  0.8820734114851763
Precision ->  0.927915699753346
Recall ->  0.8512172624870726
