## Thought Analyzer 

In [1]:
import numpy as np
import pandas as pd
from zipfile import ZipFile
import time
import os

In [None]:
zip_object = ZipFile("/content/drive/MyDrive/ML Projects/Thought Analyzer Web App/data/twitter_sentiments.zip", 'r')

In [None]:
zip_object.namelist()

['twitter_sentiments.csv']

In [None]:
twitter_csv = zip_object.open('twitter_sentiments.csv')

In [None]:
df = pd.read_csv(twitter_csv, encoding='latin', header= None)
df.head()

Unnamed: 0,0,1,2,3,4,5
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [None]:
df.columns

Int64Index([0, 1, 2, 3, 4, 5], dtype='int64')

In [None]:
df.shape

(1600000, 6)

In [None]:
df[0].value_counts()

4    800000
0    800000
Name: 0, dtype: int64

In [None]:
messages = df[5].copy()
labels = df[0].copy()

In [None]:
labels = np.where(labels == 4, 1, 0)
labels = pd.Series(labels)

### Data Preprocessing

In [3]:
import nltk
nltk.download(['stopwords', 'wordnet'])
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
import re

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
def stemmed_message_process(message):
  stemmer = PorterStemmer()
  text_clener_re = '@\S+|https?:\S+|http?:\S|[^A-Za-z]+'
  message = re.sub(text_clener_re, ' ', message)
  message = message.lower().split()
  words = [stemmer.stem(word) for word in message if word not in stopwords.words('english')]
  words = ' '.join(words)

  return words

In [None]:
start = time.time()
stemmed_messages = messages.apply(stemmed_message_process)
end = time.time()
print("Time =", (end-start)/60)

Time = 56.96157401402791


In [None]:
stemmed_data = pd.DataFrame({'messages': stemmed_messages, 'labels': labels})
stemmed_data.to_csv("/content/drive/MyDrive/ML Projects/Thought Analyzer Web App/data/stemmed_data.csv", index = False)

In [70]:
stemmed_data = pd.read_csv('/content/drive/MyDrive/ML Projects/Thought Analyzer Web App/data/stemmed_data.csv')

In [71]:
stemmed_messages = stemmed_data['messages']
labels = stemmed_data['labels']

In [72]:
null_indexes = stemmed_messages[stemmed_messages.isnull()].index

In [73]:
# dropping null values
stemmed_messages.drop(index= null_indexes, inplace = True)
stemmed_messages.reset_index(drop = True , inplace = True)
labels.drop(index= null_indexes, inplace = True)
labels.reset_index(drop = True, inplace = True)

#### Getted into numbers
We are using imdb library for converting word into numbers

In [74]:
from tensorflow.keras.datasets import imdb

In [75]:
imdb_dict = imdb.get_word_index()

In [76]:
def word_to_num(text):
  num_list = []
  text = text.split()
  for word in text:
    if imdb_dict.get(word) != None:
      num_list.append(imdb_dict[word])
  return num_list

In [77]:
stem_number_data = stemmed_messages.apply(word_to_num)

In [78]:
stem_number_len = stem_number_data.apply(lambda x: len(x))

In [79]:
low_len_indexes = stem_number_len[(stem_number_len < 10) | (stem_number_len > 12)].index

In [80]:
# droping small values
stem_number_data.drop(index= low_len_indexes, inplace = True)
stem_number_data.reset_index(drop = True , inplace = True)
labels.drop(index= low_len_indexes, inplace = True)
labels.reset_index(drop = True, inplace = True)

### Pad Sequence

In [81]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [82]:
max_sent_len = 12

stem_padded_messages = pad_sequences(stem_number_data, max_sent_len)

# Train Test Split

In [83]:
stem_padded_messages.shape

(170661, 12)

In [84]:
labels.shape

(170661,)

In [85]:
from sklearn.model_selection import train_test_split
stem_x_train, stem_x_test, stem_y_train, stem_y_test = train_test_split(stem_padded_messages, labels, test_size = 0.3, random_state = 0)

In [86]:
stem_x_train.shape, stem_y_train.shape

((119462, 12), (119462,))

In [87]:
stem_x_test.shape, stem_y_test.shape

((51199, 12), (51199,))

## Machine Learing Model

### Logistic Regression Model

In [88]:
from sklearn.linear_model import LogisticRegression

In [89]:
LR = LogisticRegression()

In [90]:
LR.fit(stem_x_train,stem_y_train)

LogisticRegression()

In [91]:
lr_y_pred = LR.predict(stem_x_test)

In [92]:
from sklearn.metrics import confusion_matrix , accuracy_score
print(accuracy_score(stem_y_test,lr_y_pred))
confusion_matrix(stem_y_test,lr_y_pred)

0.5276860876188988


array([[24187,  2688],
       [21494,  2830]])

### Random Forest Classification

In [93]:
from sklearn.ensemble import RandomForestClassifier

In [94]:
R_F_C = RandomForestClassifier()

In [95]:
R_F_C.fit(stem_x_train, stem_y_train)

RandomForestClassifier()

In [96]:
rfc_y_pred = R_F_C.predict(stem_x_test)

In [97]:
print(accuracy_score(stem_y_test, rfc_y_pred))
print(confusion_matrix(stem_y_test,rfc_y_pred))

0.5688197035098341
[[19852  7023]
 [15053  9271]]


## Model

In [98]:
from tensorflow.keras.layers import Dense, LSTM, Embedding, Dropout, Activation
from tensorflow.keras.models import Sequential

In [99]:
voc_size = len(imdb_dict)
vector_dim = 1000

In [100]:
model = Sequential()
model.add(Embedding(voc_size, vector_dim, input_length=max_sent_len))
model.add(LSTM(300))
model.add(Dense(100, activation= 'relu'))
model.add(Dense(50, activation = 'relu'))
model.add(Dense(1, activation= 'sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [101]:
model.summary()

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 12, 1000)          88584000  
                                                                 
 lstm_8 (LSTM)               (None, 300)               1561200   
                                                                 
 dense_22 (Dense)            (None, 100)               30100     
                                                                 
 dense_23 (Dense)            (None, 50)                5050      
                                                                 
 dense_24 (Dense)            (None, 1)                 51        
                                                                 
Total params: 90,180,401
Trainable params: 90,180,401
Non-trainable params: 0
_________________________________________________________________


In [102]:
model.fit(stem_x_train, stem_y_train, validation_data= (stem_x_test,stem_y_test), epochs = 10, batch_size= 1000)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f3edd5a85d0>

In [None]:
model_name = "lstm_model_73acc.h5"

In [None]:
model_path = "/content/drive/MyDrive/ML Projects/Thought Analyzer Web App/" + model_name

In [None]:
model.save(model_path)