## BERT

In [1]:
import warnings
# warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import re

from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import tensorflow as tf
from keras.preprocessing.text import Tokenizer, text_to_word_sequence

In [2]:
tf.config.list_physical_devices()

[PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU'),
 PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [5]:
#!pip install ktrain --upgrade

In [6]:
import ktrain
from ktrain import text

In [2]:
yelp = pd.read_json('https://storage.googleapis.com/msca-bdp-data-open/yelp/yelp_train_sentiment.json', orient='records', lines=True).head(50000)
yelp.shape

(50000, 3)

In [19]:
sentiment = {0: "Negative", 1: "Positive"}
yelp['sentiment'] = yelp['label'].map(sentiment)

In [20]:
df = yelp[['text', 'sentiment']].rename(columns={'text':'data', 'sentiment':'target'})

In [21]:
df

Unnamed: 0,data,target
0,I love Deagan's. I do. I really do. The atmosp...,Positive
1,I love the classes at this gym. Zumba and. Rad...,Positive
2,The tables and floor were dirty. I was the onl...,Negative
3,I had an oil change at the 15515 N Scottsdale ...,Negative
4,The absolute WORST apartment complex I have ev...,Negative
...,...,...
49995,"Would give no stars if possible. Scam artists,...",Negative
49996,What an amazing job! BBQ had been dormant for...,Positive
49997,I tried to buy chicken on sale and the manager...,Negative
49998,Been in several times since reopening in late ...,Negative


In [23]:
maxLen = 200 #each document can be of most <maxLen> words. 0 is used as padding ID.
nGramRange = 1 #size of multi-word phrases to consider
preprocessMode='bert' #Either 'standard' (normal tokenization) or 'bert' tokenization and preprocessing for use with BERT text classification model.
sampleSize = 0.3 #Proportion of training to use for validation

(x_train,  y_train), (x_test, y_test), preproc = text.texts_from_df(train_df = df, 
                                                                    text_column  = 'data', 
                                                                    label_columns = ['target'],
                                                                       val_pct=sampleSize,
                                                                       preprocess_mode=preprocessMode, #text must be preprocessed in a specific way for use with BERT
                                                                       maxlen=maxLen)

['Negative', 'Positive']
       Negative  Positive
31092       0.0       1.0
28966       0.0       1.0
30770       1.0       0.0
15777       1.0       0.0
12366       0.0       1.0
['Negative', 'Positive']
       Negative  Positive
6653        1.0       0.0
47104       0.0       1.0
10234       0.0       1.0
49618       1.0       0.0
20542       1.0       0.0
preprocessing train...
language: en


Is Multi-Label? False
preprocessing test...
language: en


In [24]:
model = text.text_classifier('bert', (x_train, y_train), preproc=preproc)

Is Multi-Label? False
maxlen is 200
done.


In [25]:
batchSize = 16 ### Check best size

learner = ktrain.get_learner(model, 
                             train_data=(x_train, y_train), 
                             val_data=(x_test, y_test), 
                             batch_size=batchSize)

In [26]:
learningRate = 2e-5
numEpoch = 3

# learner.fit_onecycle(learningRate, numEpoch)
learner.autofit(learningRate, numEpoch)



begin training using triangular learning rate policy with max lr of 2e-05...
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fa808526350>

In [27]:
learner.validate(val_data=(x_test, y_test))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      7504
           1       0.99      0.99      0.99      7496

    accuracy                           0.99     15000
   macro avg       0.99      0.99      0.99     15000
weighted avg       0.99      0.99      0.99     15000



array([[7426,   78],
       [ 106, 7390]])

In [28]:
predictor = ktrain.get_predictor(learner.model, preproc)

In [42]:
predictor.save(dataPath)

In [38]:
!fusermount -u drive

fusermount: failed to unmount /content/drive: Invalid argument


In [40]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [41]:
dataPath = "/content/drive/MyDrive/NLP/"

In [44]:
news = pd.read_csv(dataPath + "data_with_topics_cluster.csv")

In [47]:
news

Unnamed: 0.1,Unnamed: 0,index,date,language,title,text,text_clean,cluster,fist_topic,second_topic,third_topic,Topic_cluster
0,0,0,2022-01-06,english,"No deal, no school: Chicago cancels classes fo...",CHICAGO (AP) — Chicago school leaders canceled...,chicago ap chicago school leaders canceled...,0,"(1, 0.99801016)",,,7
1,1,1,2022-01-06,english,"No deal, no school: Chicago cancels classes fo...",CHICAGO (AP) — Chicago school leaders canceled...,chicago ap chicago school leaders canceled...,0,"(1, 0.9981448)",,,7
2,2,2,2022-01-06,english,Watch ‘Chicago P.D.’ Preview Wednesday,Chicago PD 9×11 “Lies” Season 9 Episode 11 Pro...,chicago pd lies season episode pro...,0,"(6, 0.68915355)","(0, 0.21457739)","(17, 0.06135006)",0
3,3,3,2022-01-06,english,Trump’s Solution for Chicago Public Schools? K...,Wise of Foolish?\nThe Chicago Tribune article ...,wise of foolish the chicago tribune article t...,0,"(12, 0.45712763)","(1, 0.32274714)","(6, 0.07527004)",1
4,4,4,2022-01-06,english,"Family, friends, colleagues gathering today at...",The neighbors near Mashawn Plummer’s Portage P...,the neighbors near mashawn plummer s portage p...,0,"(6, 0.56970793)","(2, 0.33617198)","(3, 0.0802123)",0
...,...,...,...,...,...,...,...,...,...,...,...,...
181866,181866,200114,2022-03-05,english,Chicago girl shot after celebrating 12th birth...,Authorities say a girl who was shot in the hea...,authorities say a girl who was shot in the hea...,0,"(2, 0.8981696)","(0, 0.013380353)","(1, 0.010981139)",0
181867,181867,200115,2022-03-05,english,Loyola Chicago vs. Northern Iowa – MVC Tournam...,"The No. 4 seed Loyola Chicago Ramblers (23-7, ...",the no seed loyola chicago ramblers ...,0,"(15, 0.6627376)","(3, 0.17722566)","(14, 0.08331981)",11
181868,181868,200116,2022-03-05,english,"DraftKings Illinois Promo Code: $1,050 Bonus a...",Remember to check out the DraftKings Illinois ...,remember to check out the draftkings illinois ...,0,"(10, 0.4513439)","(16, 0.3355821)","(13, 0.1768964)",0
181869,181869,200117,2022-03-05,english,"Sidney, Illinois had a median home valuation o...",[BlockShopper.com] .\n|Buyer||Address||Valuati...,buyer address valuation jennifer a ...,0,"(11, 0.9248376)","(18, 0.033204958)","(12, 0.028403208)",0


In [None]:
reloaded_predictor = ktrain.load_predictor('/home/jupyter/data/ktrain/bert/my_predictor')

In [49]:
predicted = predictor.predict(news.text.to_list())
data = news.text

results = pd.DataFrame(list(zip(predicted, data)), 
               columns =['predicted', 'data']) 

In [50]:
results

Unnamed: 0,predicted,data
0,Negative,CHICAGO (AP) — Chicago school leaders canceled...
1,Negative,CHICAGO (AP) — Chicago school leaders canceled...
2,Negative,Chicago PD 9×11 “Lies” Season 9 Episode 11 Pro...
3,Negative,Wise of Foolish?\nThe Chicago Tribune article ...
4,Positive,The neighbors near Mashawn Plummer’s Portage P...
...,...,...
181866,Negative,Authorities say a girl who was shot in the hea...
181867,Negative,"The No. 4 seed Loyola Chicago Ramblers (23-7, ..."
181868,Negative,Remember to check out the DraftKings Illinois ...
181869,Negative,[BlockShopper.com] .\n|Buyer||Address||Valuati...


In [52]:
results.to_pickle(dataPath + "sentiment_result.pkl")

## Sklearn

In [1]:
import sklearn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, HashingVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import make_pipeline
from sklearn import metrics

from joblib import dump, load

In [3]:
yelp = pd.read_json('https://storage.googleapis.com/msca-bdp-data-open/yelp/yelp_train_sentiment.json', orient='records', lines=True)
yelp.shape

(255717, 3)

In [4]:
yelp

Unnamed: 0,text,label,lang
0,I love Deagan's. I do. I really do. The atmosp...,1,en
1,I love the classes at this gym. Zumba and. Rad...,1,en
2,The tables and floor were dirty. I was the onl...,0,en
3,I had an oil change at the 15515 N Scottsdale ...,0,en
4,The absolute WORST apartment complex I have ev...,0,en
...,...,...,...
255712,Ok I get it. Ross probably isn't the most desi...,0,en
255713,You know the food there is and have bad. The D...,0,en
255714,Update to the response: you aren't going to co...,0,en
255715,I would give a minus score if it were possible...,0,en


In [5]:
X = yelp['text']
y = yelp['label']
print(X.shape)
print(y.shape)

(255717,)
(255717,)


In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=123)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(191787,)
(63930,)
(191787,)
(63930,)


### Logistic Regreszion

In [7]:
pipe_logreg = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    LogisticRegression(max_iter=1000)
)

In [9]:
%time pipe_logreg.fit(X_train, y_train)

CPU times: user 14min 54s, sys: 20min 31s, total: 35min 26s
Wall time: 21min 54s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('logisticregression', LogisticRegression(max_iter=1000))])

In [10]:
y_pred = pipe_logreg.predict(X_test)
print(f"Test Accuracy: {metrics.accuracy_score(y_test, y_pred) * 100:.5f}%")

Test Accuracy: 97.25168%


In [11]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     32016
           1       0.97      0.97      0.97     31914

    accuracy                           0.97     63930
   macro avg       0.97      0.97      0.97     63930
weighted avg       0.97      0.97      0.97     63930



In [13]:
%time dump(pipe_logreg, "logreg.joblib")

CPU times: user 1min 5s, sys: 19.5 s, total: 1min 25s
Wall time: 1min 57s


['logreg.joblib']

### SVM

In [14]:
pipe_svm = make_pipeline(
    CountVectorizer(lowercase=False, stop_words='english', ngram_range=(1,3)),
    SGDClassifier(max_iter=100, tol=None)
)

In [15]:
%time pipe_svm.fit(X_train, y_train)

CPU times: user 2min 2s, sys: 3min 50s, total: 5min 52s
Wall time: 9min 31s


Pipeline(steps=[('countvectorizer',
                 CountVectorizer(lowercase=False, ngram_range=(1, 3),
                                 stop_words='english')),
                ('sgdclassifier', SGDClassifier(max_iter=100, tol=None))])

In [16]:
y_pred = pipe_svm.predict(X_test)

In [17]:
print(metrics.accuracy_score(y_test, y_pred))

0.9735022681057407


In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.97      0.97      0.97     32016
           1       0.97      0.97      0.97     31914

    accuracy                           0.97     63930
   macro avg       0.97      0.97      0.97     63930
weighted avg       0.97      0.97      0.97     63930



In [19]:
%time dump(pipe_svm, "svm.joblib")

CPU times: user 1min 3s, sys: 15.6 s, total: 1min 18s
Wall time: 1min 45s


['svm.joblib']

In [20]:
news = pd.read_pickle("data_with_topic_cluster_v2.pkl")

In [22]:
cl = pipe_logreg.predict(news.text_clean)

In [26]:
cl_prob = pipe_logreg.predict_proba(news.text_clean)

In [31]:
cl_prob
cl_prob = pd.DataFrame(cl_prob)

In [32]:
cl_prob

Unnamed: 0,0,1
0,1.000000,5.045681e-16
1,1.000000,4.040775e-19
2,0.246794,7.532063e-01
3,1.000000,5.159463e-09
4,0.004937,9.950635e-01
...,...,...
181866,0.814880,1.851205e-01
181867,0.000028,9.999719e-01
181868,0.414387,5.856135e-01
181869,0.559324,4.406761e-01


In [33]:
news["sentiment"] = cl
news["sentiment_prob_0"]= cl_prob.iloc[:,0]
news["sentiment_prob_1"] = cl_prob.iloc[:,1]

In [34]:
news

Unnamed: 0,date,language,title,text,text_clean,cluster,topic1,w1,topic2,w2,topic3,w3,Topic_cluster,sentiment,sentiment_prob_0,sentiment_prob_1
0,2022-01-06,english,"No deal, no school: Chicago cancels classes fo...",CHICAGO (AP) — Chicago school leaders canceled...,chicago ap chicago school leaders canceled...,0,1,0.998010,,,,,7,0,1.000000,5.045681e-16
1,2022-01-06,english,"No deal, no school: Chicago cancels classes fo...",CHICAGO (AP) — Chicago school leaders canceled...,chicago ap chicago school leaders canceled...,0,1,0.998145,,,,,10,0,1.000000,4.040775e-19
2,2022-01-06,english,Watch ‘Chicago P.D.’ Preview Wednesday,Chicago PD 9×11 “Lies” Season 9 Episode 11 Pro...,chicago pd lies season episode pro...,0,6,0.689289,0.0,0.214598,17.0,0.061193,4,1,0.246794,7.532063e-01
3,2022-01-06,english,Trump’s Solution for Chicago Public Schools? K...,Wise of Foolish?\nThe Chicago Tribune article ...,wise of foolish the chicago tribune article t...,0,12,0.457139,1.0,0.322744,6.0,0.075293,4,0,1.000000,5.159463e-09
4,2022-01-06,english,"Family, friends, colleagues gathering today at...",The neighbors near Mashawn Plummer’s Portage P...,the neighbors near mashawn plummer s portage p...,0,6,0.569758,2.0,0.336124,3.0,0.080211,1,1,0.004937,9.950635e-01
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181866,2022-03-05,english,Chicago girl shot after celebrating 12th birth...,Authorities say a girl who was shot in the hea...,authorities say a girl who was shot in the hea...,0,2,0.898170,0.0,0.013380,1.0,0.010981,4,0,0.814880,1.851205e-01
181867,2022-03-05,english,Loyola Chicago vs. Northern Iowa – MVC Tournam...,"The No. 4 seed Loyola Chicago Ramblers (23-7, ...",the no seed loyola chicago ramblers ...,0,15,0.662747,3.0,0.177199,14.0,0.083335,2,1,0.000028,9.999719e-01
181868,2022-03-05,english,"DraftKings Illinois Promo Code: $1,050 Bonus a...",Remember to check out the DraftKings Illinois ...,remember to check out the draftkings illinois ...,0,10,0.451471,16.0,0.335602,13.0,0.176749,4,0,0.414387,5.856135e-01
181869,2022-03-05,english,"Sidney, Illinois had a median home valuation o...",[BlockShopper.com] .\n|Buyer||Address||Valuati...,buyer address valuation jennifer a ...,0,11,0.924808,18.0,0.033204,12.0,0.028433,8,1,0.559324,4.406761e-01


In [35]:
news.to_pickle("sentiment_lg.pkl")