In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import textblob as blob

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import recall_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import *
from keras.layers import *
from keras.callbacks import *
from keras.models import load_model
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization

# Loading Data and Data Analysis

In [None]:
# Mount Google Drive so the dataset CSV can be read from /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# The data files are expected under the FindMyExpertDataset folder in MyDrive.

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Prompt for a manual file upload in Colab.
# NOTE(review): presumably this supplies USvideos.csv, which a later cell
# reads by a relative path — confirm.
from google.colab import files
upload = files.upload()

In [None]:
# Load the YouTube comments dataset from Drive.
# Rows with the wrong number of fields are skipped and reported instead of
# aborting the load. `error_bad_lines=False` was deprecated in pandas 1.3 and
# removed in 2.0; `on_bad_lines='warn'` is the equivalent supported option.
df = pd.read_csv(
    '/content/drive/MyDrive/FindMyExpertDataset/UScomments.csv',
    on_bad_lines='warn',
)
df.head()

b'Skipping line 41589: expected 4 fields, saw 11\nSkipping line 51628: expected 4 fields, saw 7\nSkipping line 114465: expected 4 fields, saw 5\n'
b'Skipping line 142496: expected 4 fields, saw 8\nSkipping line 189732: expected 4 fields, saw 6\nSkipping line 245218: expected 4 fields, saw 7\n'
b'Skipping line 388430: expected 4 fields, saw 5\n'
  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,video_id,comment_text,likes,replies
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0
4,XpVt6Z1Gjjo,trending 😉,3,0


In [None]:
# Load the video metadata CSV from the working directory.
# Malformed rows are skipped and reported; `on_bad_lines='warn'` replaces the
# `error_bad_lines=False` option that was removed in pandas 2.0.
df2 = pd.read_csv('USvideos.csv', on_bad_lines='warn')
df2.head()

In [None]:
# Score every comment with TextBlob's polarity. Comments are cast to str
# first so that missing values do not break TextBlob.
df['sentiment'] = (
    df['comment_text']
    .astype(str)
    .map(lambda text: blob.TextBlob(text).sentiment.polarity)
)

In [None]:
# Collapse the polarity to its sign: 1 (positive), 0 (neutral), -1 (negative).
df['sentiment_score'] = np.sign(df['sentiment']).astype('int64')

In [None]:
# Preview the frame with the new sentiment and sentiment_score columns.
df.head()

Unnamed: 0,video_id,comment_text,likes,replies,sentiment,sentiment_score
0,XpVt6Z1Gjjo,Logan Paul it's yo big day ‼️‼️‼️,4,0,0.0,0
1,XpVt6Z1Gjjo,I've been following you from the start of your...,3,0,0.0,0
2,XpVt6Z1Gjjo,Say hi to Kong and maverick for me,3,0,0.0,0
3,XpVt6Z1Gjjo,MY FAN . attendance,3,0,0.0,0
4,XpVt6Z1Gjjo,trending 😉,3,0,0.0,0


In [None]:
# (rows, columns) of the comments frame, including the two sentiment columns.
df.shape

(691400, 6)

In [None]:
# Count missing values per column.
df.isnull().sum()

video_id            0
comment_text       25
likes               0
replies             0
sentiment           0
sentiment_score     0
dtype: int64

In [None]:
# Discard every row that has a missing value in any column.
df = df.dropna()

In [None]:
# Re-check after dropping rows: every column should report zero missing values.
df.isnull().sum()

video_id           0
comment_text       0
likes              0
replies            0
sentiment          0
sentiment_score    0
dtype: int64

In [None]:
# Hold out 30% of the rows as the test set.
# A fixed random_state makes the split (and every metric derived from it)
# reproducible across kernel restarts.
train, test = train_test_split(df, test_size=0.3, random_state=42)

In [None]:
# Size of the 70% training portion produced by the split.
train.shape

(483962, 6)

# Data Preprocessing

Remove punctuation, numbers, special characters and emojis (every character other than a letter or `#` is replaced with a space)

In [None]:
# Replace every character that is not a letter or '#' with a space.
# regex=True is required: on pandas >= 2.0 str.replace treats the pattern as
# a literal string by default, which would turn this step into a no-op.
# assign() builds a new frame instead of writing into a slice of `df`, which
# avoids SettingWithCopyWarning.
train = train.assign(
    comment_text=train['comment_text'].str.replace("[^a-zA-Z#]", " ", regex=True)
)

Remove short words (three characters or fewer)

In [None]:
# Keep only whitespace-separated tokens longer than three characters and
# re-join them with single spaces.
train['comment_text'] = train['comment_text'].map(
    lambda comment: ' '.join(filter(lambda word: len(word) > 3, comment.split()))
)

Lowercase comments

In [None]:
# Normalise case so that "Great" and "great" are the same token.
train['comment_text'] = train['comment_text'].str.lower()

In [None]:
# Preview the training rows after the text-cleaning cells.
train.head()

Unnamed: 0,video_id,comment_text,likes,replies,sentiment,sentiment_score
520928,H7H0ZWwWKCw,This is what California needs right now,0,0,0.285714,1
659009,eqTTNgyJm-o,People still play this game?,0,0,-0.4,-1
24665,g6piCiaGMZs,Talk about a challenge! You did a great job!,0,0,1.0,1
518611,BRhR9_tiPTo,Hi! It's Tiffany letting you know about the Sc...,0,0,0.2,1
83206,j3MvNvV27hw,Colleen you are amazing I've only been watchin...,0,0,0.4125,1


In [None]:
# Class balance of the training labels (1 positive, 0 neutral, -1 negative).
train.sentiment_score.value_counts()

 1    210515
 0    199047
-1     74400
Name: sentiment_score, dtype: int64

# Training model

In [None]:
# Carve the validation set out of the existing training portion only.
# Splitting `df` again would put held-out `test` rows back into the training
# data (inflating the test metrics) and would throw away the text cleaning
# already applied to `train`.
# NOTE(review): `test` has not been through the cleaning cells — apply the
# same steps to it before relying on test metrics.
train, valid = train_test_split(train, test_size=0.3, random_state=42)

In [None]:
# Sanity check: the split result is still a DataFrame.
type(train)

pandas.core.frame.DataFrame

In [None]:
# Size of the training portion after the train/validation split.
train.shape

(483962, 6)

In [None]:
# Build (text, one-hot label) arrays for each split.
# The label columns are pinned to the order [-1, 0, 1] (negative, neutral,
# positive) so every split has exactly three columns in the same order even
# if a class happens to be absent, and the result is cast to float32 because
# get_dummies returns uint8 or bool depending on the pandas version.
sentiment_classes = [-1, 0, 1]


def one_hot_sentiment(scores):
    """Return an (n, 3) float32 one-hot matrix for sentiment scores in {-1, 0, 1}."""
    categorical = pd.Categorical(scores, categories=sentiment_classes)
    return pd.get_dummies(categorical).to_numpy(dtype='float32')


x_tr, y_tr = train['comment_text'].values, one_hot_sentiment(train['sentiment_score'])
x_val, y_val = valid['comment_text'].values, one_hot_sentiment(valid['sentiment_score'])
x_test, y_test = test['comment_text'].values, one_hot_sentiment(test['sentiment_score'])

In [None]:
# Peek at a few training texts rather than dumping the whole array.
x_tr[:5]

In [None]:
# Fit a Keras word-index tokenizer on the training texts only.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(x_tr.tolist())

In [None]:
# Convert each split's texts into lists of word indices.
x_tr_seq, x_val_seq, x_test_seq = (
    tokenizer.texts_to_sequences(texts) for texts in (x_tr, x_val, x_test)
)

In [None]:
# Pad/truncate every index sequence to a fixed length of 100.
x_tr_seq, x_val_seq, x_test_seq = (
    pad_sequences(sequences, maxlen=100)
    for sequences in (x_tr_seq, x_val_seq, x_test_seq)
)

In [None]:
# Number of distinct words seen by the tokenizer, plus one.
size_of_vocab = 1 + len(tokenizer.word_index)
size_of_vocab

163319

In [None]:
# Preview the rows that feed the model.
train.head()

Unnamed: 0,video_id,comment_text,likes,replies,sentiment,sentiment_score
598172,j0AoUB9Nqtw,"It's not a easy fight to predict, I reckon Gsp...",0,0,0.285556,1
21586,l_uNfGY3v8E,Maybe its catastrophic because its a goddamn t...,0,0,0.0,0
644433,3WEvgqcP8mg,Clarita!!!,0,0,0.0,0
301888,v90yrcg6q9I,Weekend at Bernie's 2017,0,0,0.0,0
331382,noVEvsozyug,God is REAL. He left something for Us to find ...,0,0,0.103125,1


In [None]:
#train_words_x = [ sentence.split(' ') for sentence in train['comment_text']]

In [None]:
# ds_train_words_x = tf.ragged.constant(train_words_x)
# ds_train_words_x.shape

In [None]:
# Shape of the training texts once converted to a tensor (the same form that
# is passed to the vectorization layer's adapt()).
tf.constant(x_tr).shape

In [None]:
# Text -> integer-id layer: vocabulary capped at 10,000 tokens, every comment
# padded/truncated to 100 ids. Both numbers must match the Embedding layer
# that consumes this output.
vectorize_layer = TextVectorization(
    max_tokens=10000,
    output_mode='int',
    output_sequence_length=100,
)
# adapt() learns the vocabulary in place and returns None, so its result is
# deliberately not assigned.
vectorize_layer.adapt(tf.constant(x_tr))

In [None]:
# End-to-end classifier: raw string in, 3-way softmax out.
model = Sequential([
    tf.keras.Input(shape=(1,), dtype=tf.string),              # one raw comment per sample
    vectorize_layer,                                          # string -> 100 token ids
    Embedding(10000, 100, input_length=100, trainable=True),  # ids -> 100-d vectors
    LSTM(128, return_sequences=True, dropout=0.2),            # per-timestep hidden states
    GlobalMaxPool1D(),                                        # max over the time axis
    Dense(64, activation='relu'),
    Dense(3, activation='softmax'),                           # one unit per sentiment class
])

In [None]:
# Callbacks: stop once val_loss has not improved for 3 epochs, and keep the
# checkpoint with the highest val_acc (the two monitor different metrics).
es = EarlyStopping(
    monitor='val_loss',
    mode='min',
    verbose=1,
    patience=3,
)
mc = ModelCheckpoint(
    'best_model.h5',
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# One-hot targets, so categorical cross-entropy; accuracy is logged as 'acc'.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

In [None]:
# summary() prints the table itself and returns None, so wrapping it in
# print() would add a stray "None" line.
model.summary()

In [None]:
# Train for up to 10 epochs with early stopping and best-model checkpointing.
# validation_steps is intentionally not set: limiting validation to a single
# 2048-sample batch would make the val_loss / val_acc values that drive the
# callbacks noisy, so the full validation set is evaluated each epoch.
history = model.fit(
    x_tr,
    y_tr,
    batch_size=2048,
    epochs=10,
    verbose=1,
    validation_data=(x_val, y_val),
    callbacks=[es, mc],
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
# Restore the best checkpoint's weights into the in-memory model.
# Rebuilding the whole model from the HDF5 file with load_model() fails for
# this architecture (NotImplementedError), which would leave the last-epoch
# weights in use; load_weights() only needs the weight values and keeps the
# already-adapted vectorization layer.
model.load_weights('best_model.h5')

NotImplementedError: ignored

# Model Performance

In [None]:
# evaluate() returns [loss, acc]; only the accuracy is reported here.
val_acc = model.evaluate(x_val, y_val, batch_size=2048)[1]
print(val_acc)

0.9781113266944885


In [None]:
# Smoke-test the model on one hand-written comment.
# The output is one row of three class probabilities.
# NOTE(review): the column order presumably follows the label encoding
# (negative, neutral, positive) — confirm against how y_tr was built.
sample_comment=["This is useless"]
prediction=model.predict(sample_comment)
prediction

array([[9.9786699e-01, 2.1010351e-03, 3.1960728e-05]], dtype=float32)

In [None]:
# Predicted class index = arg-max over the three softmax outputs.
y_pred = np.argmax(model.predict(x_test), axis=1)
# Collapse the one-hot test labels to class indices. The ndim guard keeps
# the cell re-runnable: once y_test is already 1-D, a second argmax over
# axis=1 would raise.
if np.ndim(y_test) == 2:
    y_test = np.argmax(y_test, axis=1)
# Rows are true classes, columns are predicted classes.
cm = confusion_matrix(y_test, y_pred)
print(cm)

[[30735   519   500]
 [  170 84888   320]
 [  390   699 89192]]


In [None]:
# Human-readable class names, in the same order as the confusion-matrix axes.
label_names = pd.Series(['negative', 'neutral', 'positive'])
# Label the matrix: rows are the true class, columns the predicted class.
pd.DataFrame(
    cm,
    index=[f'Is {label}' for label in label_names],
    columns=[f'Predicted {label}' for label in label_names],
)

Unnamed: 0,Predicted negative,Predicted neutral,Predicted positive
Is negative,30735,519,500
Is neutral,170,84888,320
Is positive,390,699,89192


In [None]:
# Per-class precision / recall / F1 on the test split, labelled with label_names.
print(classification_report(y_test, y_pred, target_names=label_names))

              precision    recall  f1-score   support

    negative       0.98      0.97      0.97     31754
     neutral       0.99      0.99      0.99     85378
    positive       0.99      0.99      0.99     90281

    accuracy                           0.99    207413
   macro avg       0.99      0.98      0.98    207413
weighted avg       0.99      0.99      0.99    207413



In [None]:
# Recall for each class separately (average=None returns one value per class).
recall_score(y_test, y_pred, average=None)

array([0.96790955, 0.99426082, 0.98793766])

In [None]:
# Save the full model under the `saved_model` directory.
model.save('saved_model') 



INFO:tensorflow:Assets written to: saved_model/assets


INFO:tensorflow:Assets written to: saved_model/assets


In [None]:
# Bundle the saved model directory into a single zip archive.
!zip -r find_my_expert.zip /content/saved_model

  adding: content/saved_model/ (stored 0%)
  adding: content/saved_model/variables/ (stored 0%)
  adding: content/saved_model/variables/variables.data-00000-of-00001 (deflated 7%)
  adding: content/saved_model/variables/variables.index (deflated 61%)
  adding: content/saved_model/keras_metadata.pb (deflated 88%)
  adding: content/saved_model/saved_model.pb (deflated 85%)
  adding: content/saved_model/assets/ (stored 0%)


In [None]:
# NOTE(review): `export(export_dir=...)` looks like the TFLite Model Maker
# task API; confirm the Keras Sequential model built in this notebook accepts
# this call. The .tflite file is also produced by the converter cells.
model.export(export_dir='.')

In [None]:
pip install -q tflite-model-maker

In [None]:
# Build a TFLite converter from the in-memory Keras model.
# NOTE(review): the model starts with a string TextVectorization layer;
# conversion may require enabling TensorFlow select ops — confirm.
converter = tf.lite.TFLiteConverter.from_keras_model(model)

In [None]:
# Run the conversion; the result is written to disk in binary mode afterwards.
tflite_model = converter.convert()

In [None]:
# Write the converted model to disk. The context manager guarantees the file
# is flushed and closed even if the write fails.
with open("findmyexpert.tflite", "wb") as tflite_file:
    tflite_file.write(tflite_model)