In [1]:
import numpy as np 
import pandas as pd

# Loading Data and EDA 


In [2]:
# Loading data, setting columns and initial expoloration

columns = ['target', 'id','timestamp','query','user','tweet']
tweet_data = pd.read_csv("training.1600000.processed.noemoticon.csv",header = None, names = columns, encoding='latin-1')
tweet_data['target'][tweet_data['target']==4]=1
tweet_data.tail()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,target,id,timestamp,query,user,tweet
1599995,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,1,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [3]:
# Exploring data
tweet_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1600000 entries, 0 to 1599999
Data columns (total 6 columns):
 #   Column     Non-Null Count    Dtype 
---  ------     --------------    ----- 
 0   target     1600000 non-null  int64 
 1   id         1600000 non-null  int64 
 2   timestamp  1600000 non-null  object
 3   query      1600000 non-null  object
 4   user       1600000 non-null  object
 5   tweet      1600000 non-null  object
dtypes: int64(2), object(4)
memory usage: 73.2+ MB


In [4]:
# Check null values
np.sum(tweet_data.isnull().any(axis=1))

0

In [5]:
# Sample choice: 20000 negative and positive tweets
neg = tweet_data[tweet_data['target'] == 0]
pos = tweet_data[tweet_data['target'] == 1]
pos.tail()

Unnamed: 0,target,id,timestamp,query,user,tweet
1599995,1,2193601966,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,AmandaMarie1028,Just woke up. Having no school is the best fee...
1599996,1,2193601969,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,2193601991,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,bpbabe,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,2193602064,Tue Jun 16 08:40:49 PDT 2009,NO_QUERY,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...
1599999,1,2193602129,Tue Jun 16 08:40:50 PDT 2009,NO_QUERY,RyanTrevMorris,happy #charitytuesday @theNSPCC @SparksCharity...


In [6]:
neg = neg.iloc[:int(20000)]
pos = pos.iloc[:int(20000)]
tweet_data = pd.concat([pos, neg])

# preprocessing 
### Steps to follow: 
**Uncaps text**

**Remove stopwords**

**Remove puncuation**

In [7]:
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
stopwords_list[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [8]:
tweet_data['tweet']=tweet_data['tweet'].str.lower()

In [9]:
import re
def remove_repeated_char(text):
    return re.sub(r'(.)\1+', r'\1', text)

tweet_data['tweet'] = tweet_data['tweet'].apply(lambda tweet :remove_repeated_char(tweet))

tweet_data['tweet'].head()

800000          i love @health4uandpets u guys r the best! 
800001    im meting up with one of my besties tonight! c...
800002    @darealsunisakim thanks for the twiter ad, sun...
800003    being sick can be realy cheap when it hurts to...
800004        @lovesbroklyn2 he has that efect on everyone 
Name: tweet, dtype: object

In [10]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords_list])

tweet_data['tweet'] = tweet_data['tweet'].apply(lambda tweet :remove_stopwords(tweet))

tweet_data['tweet'].head()

800000                 love @health4uandpets u guys r best!
800001    im meting one besties tonight! cant wait! - gi...
800002    @darealsunisakim thanks twiter ad, sunisa! got...
800003    sick realy cheap hurts much eat real fod plus,...
800004                        @lovesbroklyn2 efect everyone
Name: tweet, dtype: object

In [11]:
import string
punctuations_list = string.punctuation

def remove_punct(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

tweet_data['tweet'] = tweet_data['tweet'].apply(lambda tweet :remove_punct(tweet))

tweet_data['tweet'].head()

800000                   love health4uandpets u guys r best
800001    im meting one besties tonight cant wait  girl ...
800002    darealsunisakim thanks twiter ad sunisa got me...
800003    sick realy cheap hurts much eat real fod plus ...
800004                         lovesbroklyn2 efect everyone
Name: tweet, dtype: object

In [12]:
#removing @ signs and URLs 

def remove_URLs(text):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',text)

def remove_numeric(text):
    return  re.sub('[0-9]+', '', text)


tweet_data['tweet'] = tweet_data['tweet'].apply(lambda tweet: remove_URLs(tweet) )
tweet_data['tweet'] = tweet_data['tweet'].apply(lambda tweet: remove_numeric(tweet) )

tweet_data['tweet'].head()



800000                    love healthuandpets u guys r best
800001    im meting one besties tonight cant wait  girl ...
800002    darealsunisakim thanks twiter ad sunisa got me...
800003    sick realy cheap hurts much eat real fod plus ...
800004                          lovesbroklyn efect everyone
Name: tweet, dtype: object

In [13]:
# Tokenization 
from nltk.tokenize import word_tokenize

def tokenize_tweet(text):
    words = word_tokenize(text)
    return text.split()


tweet_data['tweet'] = tweet_data['tweet'].apply(lambda tweet: tokenize_tweet(tweet))

tweet_data['tweet'].head()

800000             [love, healthuandpets, u, guys, r, best]
800001    [im, meting, one, besties, tonight, cant, wait...
800002    [darealsunisakim, thanks, twiter, ad, sunisa, ...
800003    [sick, realy, cheap, hurts, much, eat, real, f...
800004                      [lovesbroklyn, efect, everyone]
Name: tweet, dtype: object

In [14]:
# Lemmatization
import nltk
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

def lemmatize_tweet(tokens):
    data = [lemmatizer.lemmatize(word) for word in tokens ]
    return data

tweet_data['tweet']= tweet_data['tweet'].apply(lambda tweet : lemmatize_tweet(tweet))


In [15]:
# the finalized preprocessed tweets
tweet_data['tweet'].head()


800000              [love, healthuandpets, u, guy, r, best]
800001    [im, meting, one, besties, tonight, cant, wait...
800002    [darealsunisakim, thanks, twiter, ad, sunisa, ...
800003    [sick, realy, cheap, hurt, much, eat, real, fo...
800004                      [lovesbroklyn, efect, everyone]
Name: tweet, dtype: object

# Training preparaion 

In [16]:
# Separating labels & tweets 
X = tweet_data.tweet
Y = tweet_data.target


In [17]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence

max_len = 500
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [18]:
sequences_matrix.shape

(40000, 500)

In [19]:
# train Test Splitting
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(sequences_matrix, Y, test_size=0.3, random_state=2)

# Model Training

****Now that preprocessing is out of the way, it is now time to train the model, test it and calibrate. ****

In [20]:
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import RMSprop
# Model Definition 
def tensorflow_based_model(): #Defined tensorflow_based_model function for training tenforflow based model
    inputs = Input(name='inputs',shape=[max_len])#step1
    layer = Embedding(2000,50,input_length=max_len)(inputs) #step2
    layer = LSTM(64)(layer) #step3
    layer = Dense(256,name='FC1')(layer) #step4
    layer = Activation('relu')(layer) # step5
    layer = Dropout(0.5)(layer) # step6
    layer = Dense(1,name='out_layer')(layer) #step4 again but this time its giving only one output as because we need to classify the tweet as positive or negative
    layer = Activation('sigmoid')(layer) #step5 but this time activation function is sigmoid for only one output.
    model = Model(inputs=inputs,outputs=layer) #here we are getting the final output value in the model for classification
    return model #function returning the value when we call it

In [21]:
model = tensorflow_based_model()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(),metrics=['accuracy'])  

In [22]:
history=model.fit(X_train,Y_train,batch_size=80,epochs=8, validation_split=0.1)
print("done training")

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8
done training


In [23]:
accr1 = model.evaluate(X_test,Y_test)
print('Test set\n  Accuracy: {:0.2f}'.format(accr1[1]))

Test set
  Accuracy: 0.74


In [26]:
model.save('')



INFO:tensorflow:Assets written to: assets


INFO:tensorflow:Assets written to: assets


In [25]:
sequences_matrix.type

AttributeError: 'numpy.ndarray' object has no attribute 'type'

In [30]:
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stopwords_list])

def remove_punct(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

def remove_URLs(text):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',text)

def remove_numeric(text):
    return  re.sub('[0-9]+', '', text)

def tokenize_message(text):
    words = word_tokenize(text)
    return text.split()

def lemmatize_message(tokens):
    data = [lemmatizer.lemmatize(word) for word in tokens ]
    return data

lemmatizer = WordNetLemmatizer()
punctuations_list = string.punctuation
stopwords_list = stopwords.words('english')

feedback='very bad terrible product. buggy product needs a lot of fixes. incredibly bad and terrible product. the latest update completely broke the entire app and made it unusable'

data =  pd.DataFrame({'message': [feedback]})
data['message'].loc[0].lower()

re.sub(r'(.)\1+', r'\1', data['message'].loc[0])
data['message'].loc[0] = remove_stopwords(data['message'].loc[0])
data['message'].loc[0] = remove_punct(data['message'].loc[0])
data['message'].loc[0] = remove_URLs(data['message'].loc[0])
data['message'].loc[0] = remove_numeric(data['message'].loc[0])
data['message'].loc[0] = tokenize_message(data['message'].loc[0])
data['message'].loc[0] = lemmatize_message(data['message'].loc[0])
X= data.message

max_len = 500
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)
sequences_matrix.shape

(1, 500)

In [31]:
model.predict(sequences_matrix)

array([[0.43569538]], dtype=float32)