<a href="https://colab.research.google.com/github/dipakcodes/Twitter_Sentiment_Analysis/blob/main/Copy_of_LogisticRegression_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
import nltk
# Download the NLTK 'stopwords' corpus so the stop-word list used later is available locally.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
#print the stopwords in English
# (sanity check of the word list that the stemming step filters out)
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [6]:
#DATA PROCESSING
# Location of the labelled comments CSV on the mounted Google Drive.
# A single leading slash is used: a path starting with "//" is
# implementation-defined under POSIX and only works by accident on Linux.
path = "/content/drive/MyDrive/updated_merged.csv"

In [7]:
from google.colab import drive
# Mount Google Drive at /content/drive (Colab-only) so files under MyDrive can be read.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
# Load the CSV at `path` into a DataFrame and preview the first rows.
df = pd.read_csv(path)
df.head()

Unnamed: 0,Comment,Sentiment
0,when modi promised “minimum government maximum...,negative
1,talk all the nonsense and continue all the dra...,neutral
2,what did just say vote for modi welcome bjp t...,positive
3,asking his supporters prefix chowkidar their n...,positive
4,answer who among these the most powerful world...,positive


In [9]:
df.shape  # (rows, columns) of the loaded frame

(168367, 2)

In [10]:
df.describe()  # per-column summary (count / unique / top / freq for these text columns)

Unnamed: 0,Comment,Sentiment
count,168367,168367
unique,168084,3
top,❤,positive
freq,24,75068


In [11]:
 df['Sentiment'].value_counts()  # number of rows per sentiment label (check class balance)

Sentiment
positive    75068
neutral     56554
negative    36745
Name: count, dtype: int64

Stemming is the process of reducing a word to its root form (e.g. "promised" → "promis").

In [12]:
port_stem = PorterStemmer()  # single shared stemmer instance, used by stemming()


In [13]:
_STOP_WORDS = None  # filled lazily by _english_stop_words() on first use


def _english_stop_words():
  """Return the NLTK English stop words as a frozenset, built only once."""
  global _STOP_WORDS
  if _STOP_WORDS is None:
    # stopwords.words() returns a fresh list on every call; caching it as a
    # set avoids rebuilding the list and scanning it linearly for every token.
    _STOP_WORDS = frozenset(stopwords.words('english'))
  return _STOP_WORDS


def stemming(content):
  """Normalize a comment into a space-joined string of Porter stems.

  Steps: replace every character that is not an ASCII letter with a space,
  lowercase, split on whitespace, drop English stop words, stem each
  remaining token with the module-level ``port_stem``, and re-join the
  stems with single spaces.

  Args:
    content: Raw comment text (a ``str``).

  Returns:
    The cleaned, stemmed text; an empty string when no token survives.
  """
  stop_words = _english_stop_words()

  letters_only = re.sub('[^a-zA-Z]',' ',content)
  tokens = letters_only.lower().split()
  stems = [port_stem.stem(word) for word in tokens if word not in stop_words]

  return ' '.join(stems)

In [15]:
# Apply stemming() to every comment and store the result in a new column.
df['stemmed_content'] = df['Comment'].apply(stemming)

In [16]:
df.head()  # preview, now including the new stemmed_content column

Unnamed: 0,Comment,Sentiment,stemmed_content
0,when modi promised “minimum government maximum...,negative,modi promis minimum govern maximum govern expe...
1,talk all the nonsense and continue all the dra...,neutral,talk nonsens continu drama vote modi
2,what did just say vote for modi welcome bjp t...,positive,say vote modi welcom bjp told rahul main campa...
3,asking his supporters prefix chowkidar their n...,positive,ask support prefix chowkidar name modi great s...
4,answer who among these the most powerful world...,positive,answer among power world leader today trump pu...


In [17]:
print(df['stemmed_content'])  # inspect the stemmed text column

0         modi promis minimum govern maximum govern expe...
1                      talk nonsens continu drama vote modi
2         say vote modi welcom bjp told rahul main campa...
3         ask support prefix chowkidar name modi great s...
4         answer among power world leader today trump pu...
                                ...                        
168362                                    cant stand kamala
168363    independ trump debat kamala mike total confus ...
168364                                          penc hooray
168365        come across distord mind valu mine moral mine
168366    penc god wit greatest side stepper american hi...
Name: stemmed_content, Length: 168367, dtype: object


In [18]:
print(df['Sentiment'])  # inspect the label column

0         negative
1          neutral
2         positive
3         positive
4         positive
            ...   
168362    negative
168363    negative
168364    negative
168365    negative
168366    positive
Name: Sentiment, Length: 168367, dtype: object


In [19]:
#Separating the data and label
X = df['stemmed_content'].values  # features: stemmed text as a NumPy array
Y = df['Sentiment'].values  # labels: sentiment strings as a NumPy array

In [20]:
print(X)  # quick look at the feature array

['modi promis minimum govern maximum govern expect begin difficult job reform state take year get justic state busi exit psu templ'
 'talk nonsens continu drama vote modi'
 'say vote modi welcom bjp told rahul main campaign modi think modi relax'
 ... 'penc hooray' 'come across distord mind valu mine moral mine'
 'penc god wit greatest side stepper american histori evangel liar mayb fault hard defend indefens scale trump zero truth penc dive rapidli toward']


In [21]:
print(Y)  # quick look at the label array

['negative' 'neutral' 'positive' ... 'negative' 'negative' 'positive']


Splitting the data into training data and test data

In [26]:
# Hold out 20% of the rows for testing. stratify=Y keeps the label proportions
# the same in both parts; random_state=2 makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [27]:
print(X.shape, X_train.shape, X_test.shape)  # sanity-check sizes: total, train, test

(168367,) (134693,) (33674,)


In [28]:
#Converting the textual data to numerical data
vectorizer = TfidfVectorizer()

# Fit the vocabulary/IDF weights on the training text only, then reuse the
# fitted vectorizer on the test text so no test information leaks into it.
# NOTE(review): X_train / X_test are rebound from text arrays to TF-IDF
# matrices here, so this cell cannot be re-run without re-running the split.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer. transform(X_test)

In [29]:
print(X_train, X_test)  # prints the sparse TF-IDF matrices as (row, column) value entries

  (0, 65464)	0.574748842593725
  (0, 70817)	0.5669324558161858
  (0, 41427)	0.10283796409235321
  (0, 3935)	0.5810987107374632
  (1, 18200)	0.23307905210552238
  (1, 23053)	0.16885992048628148
  (1, 14605)	0.2367997150895104
  (1, 55966)	0.19875345545438072
  (1, 28316)	0.15760391164505047
  (1, 57333)	0.35054016244498254
  (1, 44373)	0.2763409189857391
  (1, 54598)	0.2109819160823719
  (1, 72192)	0.11536962992859981
  (1, 36295)	0.14090582164914883
  (1, 54767)	0.2963923855980684
  (1, 30483)	0.23283074026805997
  (1, 53779)	0.19087241905651198
  (1, 20854)	0.2397846443019729
  (1, 18201)	0.32403356538644845
  (1, 65980)	0.13664893150907248
  (1, 69223)	0.3388108063923076
  (1, 6503)	0.14045588728259098
  (1, 71783)	0.1325116608618517
  (1, 29780)	0.08344281613580251
  (1, 41427)	0.033509225319167384
  :	:
  (134691, 23193)	0.5177050070248178
  (134691, 1633)	0.4534457871352209
  (134691, 10265)	0.2470124301739805
  (134691, 37864)	0.25785226060615685
  (134691, 67162)	0.2451972946136

Training the Machine Learning Model

In [30]:
# Logistic regression classifier; allow up to 1000 solver iterations and fix the seed.
model = LogisticRegression(max_iter = 1000,random_state = 2)


In [31]:
model.fit(X_train, Y_train)  # train on the TF-IDF features of the training split

Model Evaluation

Accuracy Score

In [32]:
#Accuracy_score on the training data
X_train_prediction = model.predict(X_train)  # predicted labels for the training split
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)  # fraction predicted correctly

In [33]:
print('Accuracy score on the training data: ', training_data_accuracy)  # report training accuracy

Accuracy score on the training data:  0.8711588575501326


In [34]:
#Accuracy_score on the test data
X_test_prediction = model.predict(X_test)  # predicted labels for the held-out test split
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)  # fraction predicted correctly

In [35]:
print('Accuracy score on the test data: ', test_data_accuracy)  # report held-out accuracy

Accuracy score on the test data:  0.8377977074300648


Saving the trained model

In [36]:
import joblib

In [37]:
filename = 'Trained_move.joblib'
# Write through a context manager so the file handle is flushed and closed
# deterministically; a bare open() passed inline is never closed explicitly.
with open(filename, 'wb') as model_file:
  joblib.dump(model, model_file)
# NOTE(review): only the classifier is persisted here. Scoring new raw text
# also needs the fitted TfidfVectorizer, which is not saved.

Using the saved model for future predictions

In [46]:
#Loading the saved model
# joblib.load unpickles arbitrary Python objects: only load files you created
# or otherwise trust. The context manager closes the file handle after loading.
with open('/content/Trained_move.joblib', 'rb') as model_file:
  loaded_model = joblib.load(model_file)

In [47]:
# Take one held-out sample (row 200 of the TF-IDF test matrix) and show its true label.
X_new = X_test[200]
print(Y_test[200])

# Predict with the model reloaded from disk rather than the in-memory `model`,
# so this cell actually verifies the saved artifact.
prediction = loaded_model.predict(X_new)
print(prediction)

# Human-readable label; anything other than negative/positive is reported as Neutral.
if (prediction[0] == 'negative'):
  print('Negative')
elif (prediction[0] == 'positive'):
  print('Positive')
else:
  print('Neutral')

negative
['negative']
Negative
