<a href="https://colab.research.google.com/github/divyaag123/Twitter-Sentiment-analysis/blob/main/TwitterSEntiment_analysis_4year_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#installing kaggle library
!pip install kaggle



In [None]:
# Make the Kaggle API token available to the CLI:
#   1. create ~/.kaggle if it does not exist,
#   2. copy kaggle.json from the current working directory into it,
#   3. restrict the token file to owner read/write (mode 600).
# kaggle.json must already be in the working directory when this cell runs.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [None]:

# Download the kazanova/sentiment140 dataset archive with the Kaggle CLI.
# As the saved output shows, the CLI skips the download when a newer local copy exists.
!kaggle datasets download -d kazanova/sentiment140

sentiment140.zip: Skipping, found more recently modified local copy (use --force to force download)


In [None]:
# Extract the downloaded dataset archive into the current working directory.

from zipfile import ZipFile
dataset = '/content/sentiment140.zip'


# The handle is named `archive`, not `zip`: a cell-level `with ... as zip`
# rebinds the built-in zip() for every later cell of the notebook.
with ZipFile(dataset , 'r') as archive:
  archive.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [None]:
#importing the dependencies
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [None]:
# Mount Google Drive at /content/drive (only works inside Google Colab).
# NOTE(review): no other visible cell reads from or writes to /content/drive — confirm the mount is needed.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Fetch NLTK's stopword corpus; stopwords.words('english') below depends on it.
import nltk
nltk.download('stopwords')

In [None]:
# Print the English stopword list; these are the words the stemming step filters out.
print(stopwords.words('english'))

Data Processing


In [None]:
# Load the extracted CSV into a pandas DataFrame for a first look.
# NOTE: neither `names` nor `header` is passed, so pandas takes the first line of the
# file as column labels; the file is re-read with explicit column names a few cells below.
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv' , encoding = 'ISO-8859-1')

In [None]:
# Number of rows and columns in the initial load (before column names are assigned).
twitter_data.shape

In [None]:
# Preview the first five rows of the initial load.
twitter_data.head()

In [None]:
# Re-read the dataset, this time supplying the column names explicitly.
# Passing `names` makes pandas treat every line of the file as data.


column_names = ['target' , 'id' , 'date' , 'flag' ,'user' , 'text']
twitter_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv' , names = column_names ,encoding = 'ISO-8859-1')



In [None]:
# Shape after re-reading with explicit column names.
twitter_data.shape

In [None]:
# First five rows, now labelled with the names from column_names.
twitter_data.head()

In [None]:
# Count the missing values in each column.
# (This cell only counts; nothing is replaced here.)
twitter_data.isnull().sum()

In [None]:
# Inspect the distribution of the target variable:
# how many tweets carry each sentiment label.
twitter_data['target'].value_counts()

In [None]:
# The classes are evenly distributed, so only the label encoding needs adjusting.
# Map the positive label 4 to 1 (0 stays negative); this is part of data preprocessing.
twitter_data['target'] = twitter_data['target'].replace({4: 1})

In [None]:
# NOTE(review): same 4 -> 1 mapping as the previous cell; once that cell has run,
# no 4s remain in 'target', so this changes nothing.
twitter_data.replace({'target' : {4: 1} }, inplace = True)

In [None]:
# Label counts after the 4 -> 1 remapping.
twitter_data['target'].value_counts()

In [None]:
# 0 ---> negative Tweet
# 1 ----> positive tweet


In [None]:
# We are going to use NLTK's PorterStemmer for stemming

Stemming ----> it is the process of reducing a word to its root word

In [None]:
# Single PorterStemmer instance, used by stemming() below.
port_stem = PorterStemmer()

In [None]:
_ENGLISH_STOPWORDS = None  # built on the first call to stemming(), then reused


def stemming(content):
  """Clean one tweet and return its stemmed, stopword-free tokens as a string.

  Steps: replace every non-letter character with a space, lowercase, split on
  whitespace, drop English stopwords, Porter-stem the remaining words and
  re-join them with single spaces.

  Args:
    content: raw tweet text (str).

  Returns:
    The cleaned tweet as one space-separated string ('' if no word survives).
  """
  global _ENGLISH_STOPWORDS
  if _ENGLISH_STOPWORDS is None:
    # Previously stopwords.words('english') was evaluated once per *word* and
    # searched as a list. Building the collection once and testing membership
    # against a frozenset yields the same tokens with far less work.
    _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

  letters_only = re.sub('[^a-zA-Z]' ,' ', content)  # anything that is not a letter becomes a space
  tokens = letters_only.lower().split()
  stems = [port_stem.stem(word) for word in tokens if word not in _ENGLISH_STOPWORDS]
  return ' '.join(stems)


In [None]:
# Run stemming() on every tweet and store the result in a new 'stemmed_content' column.
# The author recorded roughly 50 minutes for this cell on the full dataset.
twitter_data['stemmed_content'] = twitter_data['text'].apply(stemming) # 50 min to complete the execution

In [None]:
# Preview the first rows, now including the 'stemmed_content' column.
twitter_data.head()

Unnamed: 0,target,id,date,flag,user,text,stemmed_content
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",switchfoot http twitpic com zl awww bummer sho...
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...,upset updat facebook text might cri result sch...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...,kenichan dive mani time ball manag save rest g...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire,whole bodi feel itchi like fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all....",nationwideclass behav mad see


In [None]:
# Show the stemmed text column (pandas abbreviates the printed Series).
print(twitter_data['stemmed_content'])

0          switchfoot http twitpic com zl awww bummer sho...
1          upset updat facebook text might cri result sch...
2          kenichan dive mani time ball manag save rest g...
3                            whole bodi feel itchi like fire
4                              nationwideclass behav mad see
                                 ...                        
1599995                           woke school best feel ever
1599996    thewdb com cool hear old walt interview http b...
1599997                         readi mojo makeov ask detail
1599998    happi th birthday boo alll time tupac amaru sh...
1599999    happi charitytuesday thenspcc sparkschar speak...
Name: stemmed_content, Length: 1600000, dtype: object


In [None]:
# Show the label column.
print(twitter_data['target'])

0          0
1          0
2          0
3          0
4          0
          ..
1599995    1
1599996    1
1599997    1
1599998    1
1599999    1
Name: target, Length: 1600000, dtype: int64


In [None]:
# Separate the features from the labels the model has to learn:
#   X -> stemmed tweet text
#   Y -> sentiment label (0 = negative tweet, 1 = positive tweet)
X = twitter_data['stemmed_content'].to_numpy()
Y = twitter_data['target'].to_numpy()

In [None]:
# Show the feature array (stemmed tweet strings).
print(X)

['switchfoot http twitpic com zl awww bummer shoulda got david carr third day'
 'upset updat facebook text might cri result school today also blah'
 'kenichan dive mani time ball manag save rest go bound' ...
 'readi mojo makeov ask detail'
 'happi th birthday boo alll time tupac amaru shakur'
 'happi charitytuesday thenspcc sparkschar speakinguph h']


In [None]:
# Show the label array.
print(Y)

[0 0 0 ... 1 1 1]


In [None]:
# Split the data into training and test sets.
# test_size = 0.2 holds out 20% for testing, stratify = Y keeps the label ratio the same
# in both splits, and random_state = 2 makes the shuffle reproducible.
X_train , X_test , Y_train , Y_test = train_test_split(X , Y , test_size = 0.2 , stratify = Y , random_state = 2)

In [None]:
# Compare the sizes of the full data, the training split and the test split.
# NOTE(review): the saved output shows a 2-D X_train, so this cell was last executed after
# the TF-IDF cell further down; on a fresh top-to-bottom run all three shapes are 1-D.
print(X.shape , X_train.shape , X_test.shape )

(1600000,) (1280000, 461488) (320000,)


In [None]:
# Show the training split.
# NOTE(review): the saved output is already a sparse TF-IDF matrix, i.e. it comes from a run
# made after the vectorisation cell below; at this point in a fresh run X_train is still text.
print(X_train  )

  (0, 443066)	0.4484755317023172
  (0, 235045)	0.41996827700291095
  (0, 109306)	0.3753708587402299
  (0, 185193)	0.5277679060576009
  (0, 354543)	0.3588091611460021
  (0, 436713)	0.27259876264838384
  (1, 160636)	1.0
  (2, 288470)	0.16786949597862733
  (2, 132311)	0.2028971570399794
  (2, 150715)	0.18803850583207948
  (2, 178061)	0.1619010109445149
  (2, 409143)	0.15169282335109835
  (2, 266729)	0.24123230668976975
  (2, 443430)	0.3348599670252845
  (2, 77929)	0.31284080750346344
  (2, 433560)	0.3296595898028565
  (2, 406399)	0.32105459490875526
  (2, 129411)	0.29074192727957143
  (2, 407301)	0.18709338684973031
  (2, 124484)	0.1892155960801415
  (2, 109306)	0.4591176413728317
  (3, 172421)	0.37464146922154384
  (3, 411528)	0.27089772444087873
  (3, 388626)	0.3940776331458846
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 390130)	0.22064742191076112
  (1279996, 434014)	0.2718945052332447
  (1279996, 318303)	0.21254698865277746
  (1279996, 237899)	0.2236567560099234
  (1279996, 2910

In [None]:
# Show the test split (stemmed tweet text, not yet vectorised).
print(X_test)

['mmangen fine much time chat twitter hubbi back summer amp tend domin free time'
 'ah may show w ruth kim amp geoffrey sanhueza'
 'ishatara mayb bay area thang dammit' ...
 'destini nevertheless hooray member wonder safe trip' 'feel well'
 'supersandro thank']


Feature Extraction or vectorisation ----> converting the textual data to numerical data

In [None]:
# Turn the tweet text into TF-IDF features.
# The vectoriser is fitted on the training split only and then applied to the
# test split, so the test data does not influence the vocabulary or weights.
#
# X_train / X_test are overwritten by this cell, so running it a second time
# used to hand the already-vectorised sparse matrix back to the vectoriser
# (consistent with the AttributeError in the saved output). The guard makes
# the cell safe to re-run: the text splits are NumPy arrays, whereas TF-IDF
# output is a SciPy sparse matrix.
if isinstance(X_train , np.ndarray):
  vectorizer = TfidfVectorizer()

  X_train = vectorizer.fit_transform(X_train)
  X_test = vectorizer.transform(X_test)

AttributeError: ignored

In [None]:
# Print the TF-IDF training matrix; each output line is (row, feature index) followed by the weight.
print(X_train)

  (0, 443066)	0.4484755317023172
  (0, 235045)	0.41996827700291095
  (0, 109306)	0.3753708587402299
  (0, 185193)	0.5277679060576009
  (0, 354543)	0.3588091611460021
  (0, 436713)	0.27259876264838384
  (1, 160636)	1.0
  (2, 288470)	0.16786949597862733
  (2, 132311)	0.2028971570399794
  (2, 150715)	0.18803850583207948
  (2, 178061)	0.1619010109445149
  (2, 409143)	0.15169282335109835
  (2, 266729)	0.24123230668976975
  (2, 443430)	0.3348599670252845
  (2, 77929)	0.31284080750346344
  (2, 433560)	0.3296595898028565
  (2, 406399)	0.32105459490875526
  (2, 129411)	0.29074192727957143
  (2, 407301)	0.18709338684973031
  (2, 124484)	0.1892155960801415
  (2, 109306)	0.4591176413728317
  (3, 172421)	0.37464146922154384
  (3, 411528)	0.27089772444087873
  (3, 388626)	0.3940776331458846
  (3, 56476)	0.5200465453608686
  :	:
  (1279996, 390130)	0.22064742191076112
  (1279996, 434014)	0.2718945052332447
  (1279996, 318303)	0.21254698865277746
  (1279996, 237899)	0.2236567560099234
  (1279996, 2910

In [None]:
# Print the test features.
# NOTE(review): the saved output still shows raw text, so X_test had not been vectorised
# in the saved run (the vectorisation cell above ended with an error).
print(X_test)

['mmangen fine much time chat twitter hubbi back summer amp tend domin free time'
 'ah may show w ruth kim amp geoffrey sanhueza'
 'ishatara mayb bay area thang dammit' ...
 'destini nevertheless hooray member wonder safe trip' 'feel well'
 'supersandro thank']


Training the machine learning model ---> Logistic Regression


In [None]:
# Logistic regression classifier; max_iter = 1000 caps the solver at 1000 iterations.
model = LogisticRegression(max_iter = 1000)

In [None]:
# Fit the classifier on the training features and their labels.
model.fit(X_train , Y_train)

Model evaluation
Accuracy Score

In [None]:
# Accuracy on the training data: predict the labels of the samples the model was
# fitted on and compare them with the true training labels.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train , X_train_prediction)


In [None]:
# Report the training accuracy computed in the previous cell.
print('Accuracy score on the training data : ', training_data_accuracy)

Accuracy score on the training data :  0.81018984375


In [None]:
# Accuracy score on the held-out test data.
# NOTE(review): the saved output shows a ValueError here; that is consistent with X_test still
# being raw text in the saved run (see the print(X_test) output above) — confirm with a clean re-run.
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test , X_test_prediction)

ValueError: ignored

In [None]:
# Report the test accuracy computed in the previous cell.
print('Accuracy score on the test data: ' , test_data_accuracy)

In [None]:
# Overfitting shows up as training accuracy being much higher than test accuracy.
# Here the two are close (about 81.0% on training vs 77.8% on test), so the model generalises reasonably well.




Model test accuracy is 77.8%

Saving the trained model

In [None]:
import pickle

In [None]:
# Persist the fitted classifier to disk with pickle.
# NOTE(review): only `model` is saved; the fitted TF-IDF `vectorizer` is not, so a fresh
# session cannot turn new raw text into features for the reloaded model.
filename = 'trained_model.sav'
# `with` closes the file (and flushes the pickle to disk) even if dump() raises.
with open(filename , 'wb') as model_file:
  pickle.dump(model , model_file)

Using the saved model for future predictions

In [None]:
# Load the saved model back from 'trained_model.sav', the file written by the saving cell.
# (The path used to be an empty string, which open() cannot resolve.)
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
with open('trained_model.sav' , 'rb') as model_file:
  loaded_model = pickle.load(model_file)

In [None]:
# Predict the sentiment of test sample 200 and print it next to the true label.
# NOTE(review): this section is about the saved model, but the call below uses the
# in-memory `model`; switch to `loaded_model` once it has been loaded successfully.
X_new = X_test[200]
print(Y_test[200])

prediction = model.predict(X_new)
print(prediction)

# 0 -> negative tweet, 1 -> positive tweet
if(prediction[0] == 0):
  print('Negative Tweet')
else:
  print('Positive Tweet')

In [None]:
# Predict the sentiment of test sample 3 and print it next to the true label.
# NOTE(review): this section is about the saved model, but the call below uses the
# in-memory `model`; switch to `loaded_model` once it has been loaded successfully.
X_new = X_test[3]
print(Y_test[3])

prediction = model.predict(X_new)
print(prediction)

# 0 -> negative tweet, 1 -> positive tweet
if(prediction[0] == 0):
  print('Negative Tweet')
else:
  print('Positive Tweet')