# LIBRARIES

In [1]:
import pandas as pd
import csv
import re
import numpy as np
import pickle
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS

# METHODS


In [2]:
def importDataset(fileName, header = None):
    dataset = pd.read_csv(fileName, encoding = 'ISO-8859-1', header = header)
    dataset.columns = ['sentiment','id','date','flag','user','text']
    for i in ['flag','id','user','date']: del dataset[i]
    dataset.sentiment = dataset.sentiment.replace(4,1)
    return dataset

def preprocessingForTweetPart(text):
        text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL',text)
        text = re.sub('@[^\s]+','USERNAME', text)
        text = re.sub(r'#([^\s]+)', r'\1', text)
        return text
    
def NegativeOrPositive(userInput):
    input1 = preprocessingForTweetPart(userInput)
    input1 = np.array([userInput])
    vector = tfv.transform(input1)
    sentiment = classifier.predict(vector)
    if(sentiment == 0):
        print("NEGATIVE")
        print('\U0001F614')
    else:
        print("POSITIVE")
        print('\U0001F600')
            

# CVS FILE IMPORT 

In [3]:
dataset = importDataset("dataset.csv")

In [4]:
dataset.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [5]:
dataset.tail()

Unnamed: 0,sentiment,text
1599995,1,Just woke up. Having no school is the best fee...
1599996,1,TheWDB.com - Very cool to hear old Walt interv...
1599997,1,Are you ready for your MoJo Makeover? Ask me f...
1599998,1,Happy 38th Birthday to my boo of alll time!!! ...
1599999,1,happy #charitytuesday @theNSPCC @SparksCharity...


# PREPROCESSING FOR @username, hashtags and URL

In [6]:
dataset['text'] = dataset['text'].apply(preprocessingForTweetPart)
data = np.array(dataset['text'])
label = np.array(dataset.sentiment)

In [7]:
dataset.head()

Unnamed: 0,sentiment,text
0,0,"USERNAME URL - Awww, that's a bummer. You sho..."
1,0,is upset that he can't update his Facebook by ...
2,0,USERNAME I dived many times for the ball. Mana...
3,0,my whole body feels itchy and like its on fire
4,0,"USERNAME no, it's not behaving at all. i'm mad..."


# FEATURES EXTRACTION

In [8]:
tfv = TfidfVectorizer(sublinear_tf=True, stop_words = "english") 
features = tfv.fit_transform(data)

In [9]:
dataset.head()

Unnamed: 0,sentiment,text
0,0,"USERNAME URL - Awww, that's a bummer. You sho..."
1,0,is upset that he can't update his Facebook by ...
2,0,USERNAME I dived many times for the ball. Mana...
3,0,my whole body feels itchy and like its on fire
4,0,"USERNAME no, it's not behaving at all. i'm mad..."


# REMOVE PUNCTUATIONS

In [10]:
string.punctuation
def removePunctuation(text):
    textNoPunct = "".join([i for i in text if i not in string.punctuation])
    return textNoPunct

In [11]:
dataset['Text No Punctuation'] = dataset['text'].apply(lambda x: removePunctuation(x))

In [12]:
dataset.head()

Unnamed: 0,sentiment,text,Text No Punctuation
0,0,"USERNAME URL - Awww, that's a bummer. You sho...",USERNAME URL Awww thats a bummer You shoulda...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...
2,0,USERNAME I dived many times for the ball. Mana...,USERNAME I dived many times for the ball Manag...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"USERNAME no, it's not behaving at all. i'm mad...",USERNAME no its not behaving at all im mad why...


# REMOVE NUMBERS AND LOWER CASE PART

In [13]:
def lowerText(text):
    text = text.lower()
    text = re.sub("\d+","",text)
    return text

dataset['All Lower Case'] = dataset['Text No Punctuation'].apply(lambda x: lowerText(x))

In [14]:
dataset.head()

Unnamed: 0,sentiment,text,Text No Punctuation,All Lower Case
0,0,"USERNAME URL - Awww, that's a bummer. You sho...",USERNAME URL Awww thats a bummer You shoulda...,username url awww thats a bummer you shoulda...
1,0,is upset that he can't update his Facebook by ...,is upset that he cant update his Facebook by t...,is upset that he cant update his facebook by t...
2,0,USERNAME I dived many times for the ball. Mana...,USERNAME I dived many times for the ball Manag...,username i dived many times for the ball manag...
3,0,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire
4,0,"USERNAME no, it's not behaving at all. i'm mad...",USERNAME no its not behaving at all im mad why...,username no its not behaving at all im mad why...


# REMOVE UNNECESSARY COLUMNS

In [15]:
dataset.drop(columns=['text','Text No Punctuation'],inplace =True)
dataset.head()

Unnamed: 0,sentiment,All Lower Case
0,0,username url awww thats a bummer you shoulda...
1,0,is upset that he cant update his facebook by t...
2,0,username i dived many times for the ball manag...
3,0,my whole body feels itchy and like its on fire
4,0,username no its not behaving at all im mad why...


# RENAME FOR LAST VERSION OF DATA

In [16]:
dataset.rename(columns= {'All Lower Case':'Tweet'},inplace =True)
dataset.head()

Unnamed: 0,sentiment,Tweet
0,0,username url awww thats a bummer you shoulda...
1,0,is upset that he cant update his facebook by t...
2,0,username i dived many times for the ball manag...
3,0,my whole body feels itchy and like its on fire
4,0,username no its not behaving at all im mad why...


# TOKENIZING

In [17]:
nlp = English()

def tokenize(text):
    myFile = nlp(text)
   
    tokenList = [token.text for token in myFile]
        
    return tokenList

dataset['Tokens'] = dataset['Tweet'].apply(lambda x: tokenize(x))

# STOPWORDS

In [18]:
def removingStopwords(text):
    listStopwordsRemoved = [i for i in text if nlp.vocab[i].is_stop == False]
    return listStopwordsRemoved
dataset['No Stopwords'] = dataset['Tokens'].apply(lambda x:removingStopwords(x))
dataset.head()

Unnamed: 0,sentiment,Tweet,Tokens,No Stopwords
0,0,username url awww thats a bummer you shoulda...,"[username, url, , awww, that, s, a, bummer, ...","[username, url, , awww, s, bummer, , shoulda..."
1,0,is upset that he cant update his facebook by t...,"[is, upset, that, he, ca, nt, update, his, fac...","[upset, nt, update, facebook, texting, cry, re..."
2,0,username i dived many times for the ball manag...,"[username, i, dived, many, times, for, the, ba...","[username, dived, times, ball, managed, save, ..."
3,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[body, feels, itchy, like, fire]"
4,0,username no its not behaving at all im mad why...,"[username, no, its, not, behaving, at, all, i,...","[username, behaving, m, mad, nt]"


# LEMMATIZING

In [19]:
nltk.download('wordnet')
wordnet = nltk.WordNetLemmatizer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\MONSTER\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [20]:
def lemmatization(token_text):
    text = [wordnet.lemmatize(word) for word in token_text]
    return text

In [21]:
nltk.download('omw-1.4')
dataset['Lemmatized'] = dataset['No Stopwords'].apply(lambda x:lemmatization(x))
dataset.head()

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\MONSTER\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Unnamed: 0,sentiment,Tweet,Tokens,No Stopwords,Lemmatized
0,0,username url awww thats a bummer you shoulda...,"[username, url, , awww, that, s, a, bummer, ...","[username, url, , awww, s, bummer, , shoulda...","[username, url, , awww, s, bummer, , shoulda..."
1,0,is upset that he cant update his facebook by t...,"[is, upset, that, he, ca, nt, update, his, fac...","[upset, nt, update, facebook, texting, cry, re...","[upset, nt, update, facebook, texting, cry, re..."
2,0,username i dived many times for the ball manag...,"[username, i, dived, many, times, for, the, ba...","[username, dived, times, ball, managed, save, ...","[username, dived, time, ball, managed, save, ..."
3,0,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...","[body, feels, itchy, like, fire]","[body, feel, itchy, like, fire]"
4,0,username no its not behaving at all im mad why...,"[username, no, its, not, behaving, at, all, i,...","[username, behaving, m, mad, nt]","[username, behaving, m, mad, nt]"


# REMOVE UNNECESSARY COLUMNS AGAIN

In [22]:
dataset.drop(columns = ['Tweet','Tokens','No Stopwords'],inplace =True)
dataset.head()

Unnamed: 0,sentiment,Lemmatized
0,0,"[username, url, , awww, s, bummer, , shoulda..."
1,0,"[upset, nt, update, facebook, texting, cry, re..."
2,0,"[username, dived, time, ball, managed, save, ..."
3,0,"[body, feel, itchy, like, fire]"
4,0,"[username, behaving, m, mad, nt]"


In [23]:
def convertString(text):
    string = ' '.join(text)
    return string

In [24]:
dataset['String'] = dataset['Lemmatized'].apply(lambda x: convertString(x))
dataset.drop(columns= 'Lemmatized',inplace=True)
dataset.head()

Unnamed: 0,sentiment,String
0,0,username url awww s bummer shoulda got dav...
1,0,upset nt update facebook texting cry result ...
2,0,username dived time ball managed save rest ...
3,0,body feel itchy like fire
4,0,username behaving m mad nt


# RENAME FOR LAST VERSION OF TWEETS

In [25]:
# renaming
dataset.rename(columns= {'String': 'Tweet'},inplace =True)
dataset.head()

Unnamed: 0,sentiment,Tweet
0,0,username url awww s bummer shoulda got dav...
1,0,upset nt update facebook texting cry result ...
2,0,username dived time ball managed save rest ...
3,0,body feel itchy like fire
4,0,username behaving m mad nt


In [26]:
dataset['sentiment'].value_counts()

0    800000
1    800000
Name: sentiment, dtype: int64

# SPLITING DATA FOR TRAINING AND TESTING

In [27]:
X_train, X_test, y_train, y_test = train_test_split(features, label, test_size=0.2)   

# MULTINOMIAL NAIVE BAYES FIT MODEL TO DATA

In [28]:
model = MultinomialNB()

classifier = model.fit(features, label)

# OPEN AND LOAD CLASSIFIER  

In [29]:
open_classifier = open('Naive_bayes.pickle','wb')
pickle.dump(classifier, open_classifier)
open_classifier.close()

In [30]:
load_classifier = open('Naive_bayes.pickle','rb')
classifier_n = pickle.load(load_classifier)
load_classifier.close()

# PREDICTION TEST AND MODEL ACCURACY SCORE %79

In [31]:
predictions = classifier_n.predict(X_test)
print(predictions)
score = classifier_n.score(X_test, y_test)
print(score)

[0 0 0 ... 0 0 0]
0.79706875


In [32]:
predictions = model.predict(X_test)
print(predictions)
score = model.score(X_test, y_test)
print(score)

[0 0 0 ... 0 0 0]
0.79706875


# PREDICT NEGATIVE OR POSITIVE FOR USER INPUT

In [64]:
i = input("INPUT :")
NegativeOrPositive(i)


INPUT :good things
POSITIVE
😀
