In [1]:
# installing kaggle library

! pip install kaggle



In [2]:
# configuring the path of kaggle.json file

! mkdir -p ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json

The syntax of the command is incorrect.
'cp' is not recognized as an internal or external command,
operable program or batch file.
'chmod' is not recognized as an internal or external command,
operable program or batch file.


In [None]:
# API to fetch the dataset from kaggle

! kaggle datasets download -d kazanova/sentiment140

In [None]:
# extracting the zip file

from zipfile import ZipFile
dataset = '/content/sentiment140.zip'

with ZipFile(dataset, 'r') as zip:
  zip.extractall()
  print('The dataset is extracted')

# **Importing Dependencies**

In [None]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [None]:
import nltk
nltk.download('stopwords')

In [None]:
# printing stopwords in English
# stopwords are those words which does not add any meaning to the model
# its usecase here is to remove such words from our dataset which are not required for analysis to reduce complexity

print(stopwords.words('english'))

# **Data Collection & Processing**

In [None]:
# loding data through pandas dataframe

social_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', encoding = 'ISO-8859-1')

In [None]:
# checking the number of rows and cols

social_data.shape

In [None]:
# printing first five rows of dataframe

social_data.head()

In [None]:
# naming cols and reading dataset again
# earlier the first post was read as col name
# therefore it was showing 1599999 rows
# now after giving col names it's

col_name = ['target','id', 'date', 'user','flag','text']
social_data = pd.read_csv('/content/training.1600000.processed.noemoticon.csv', names = col_name, encoding = 'ISO-8859-1')
social_data.shape

In [None]:
# printing first five rows of dataframe

social_data.head()

In [None]:
# counting missing values
# so that we can drop them

social_data.isnull().sum()

In [None]:
# converting 4 to 1

# annotation in out dataset :-
# 0 -> negative
# 4 -> positive

# inplace = true, so that changes can be made in original dataset

social_data.replace({'target':{4:1}}, inplace=True)

In [None]:
# checking distribution of target cols

social_data['target'].value_counts()



```
0 --> Negative
1 --> Positive
```



# **Stemming**

*Reducing the word to its word root so that Model can analyse it efficiently*

In [None]:
# loading the instance of the porter stemmer ()

port_stem = PorterStemmer()

In [None]:
# defining the stemming function

def stemming(content):

  stemmed_content = re.sub('[^a-zA-Z]',' ',content) # every character apart from a-z A-Z will be removed
  stemmed_content = stemmed_content.lower()         # converting to lower case
  stemmed_content = stemmed_content.split()         # splitting the words

  # if word is not in stop words stemming will take place else not
  stemmed_content = [port_stem.stem(word) for word in stemmed_content if not word in stopwords.words('english')]

  # combining all the root words again after splitting
  stemmed_content = ' '.join(stemmed_content)

  return stemmed_content

In [None]:
# applying the function on the data

social_data['stemmed_content'] = social_data['text'].apply(stemming)

In [None]:
# printing the stemmed content

print(social_data['stemmed_content'])

In [None]:
print(social_data['target'])

In [None]:
# separating the data and label

x = social_data['stemmed_content'].values
y = social_data['target'].values

In [None]:
print(x)

In [None]:
print(y)

# **Splitting Data to training data and test data**

In [None]:
# train_test_split()  : splits data into training and test data
# x_train and y_train : store the training data & corresponding label
# x_test and y_test   : store the testing data & corresponding label
# test_size = 0.2     : 20% data for testing 80% data for training
# stratify = y        : ensure equal proportion of 0 & 1 label
# random_state        : splitting the data randomly

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, stratify=y, random_state=2)

In [None]:
# 20% of 1600000 = 320000
# 80% of 1600000 = 1280000

print(x.shape, x_train.shape, x_test.shape)

In [None]:
print(x_train)

In [None]:
print(x_test)

In [None]:
# convert text data into numerical data so that ML model understand
# TfidfVectorizer : assigns the importance to each individual word

vectorizer = TfidfVectorizer()

# fit_transform : learns from the data and transorm values
# transform     : only transform the values

x_train = vectorizer.fit_transform(x_train)
x_test = vectorizer.transform(x_test)

In [None]:
print(x_train)

In [None]:
print(x_test)

# **Training the ML Model**

*Logistic Regression*
- classification model used to classify data points to different classes

In [None]:
# Train the Logistic Regression model

model = LogisticRegression(max_iter=1000)
model.fit(x_train, y_train)


In [None]:
import pickle

In [None]:
# Save the TfidfVectorizer
# so that we can reuse it later
# without retraining it

with open('tfidf_vectorizer.pkl', 'wb') as f: # wb : write binary
    pickle.dump(vectorizer, f)

In [None]:
# Save the trained model

filename = 'trained_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [None]:
# Load the trained model

loaded_model = pickle.load(open('trained_model.sav', 'rb'))

In [None]:
# Load the TfidfVectorizer

vectorizer = pickle.load(open('tfidf_vectorizer.pkl', 'rb'))

In [None]:
# Function to predict sentiment of a single input sentence

def predict_sentiment(sentence):
    preprocessed_sentence = stemming(sentence)
    transformed_sentence = vectorizer.transform([preprocessed_sentence])
    prediction = loaded_model.predict(transformed_sentence)
    if prediction[0] == 0:
        return 'Negative'
    else:
        return 'Positive'

In [None]:
# Example usage

my_sentence = "I love this product! It's amazing."
print(f'Sentence: "{my_sentence}" -> Sentiment: {predict_sentiment(my_sentence)}')

In [None]:
# Example usage

my_sentence = "This is the worst service I've ever had."
print(f'Sentence: "{my_sentence}" -> Sentiment: {predict_sentiment(my_sentence)}')

## **Model Evaluation**

*Accuracy Score*

In [None]:
# accuracy score on training data
# predict() will tell wheter the data is postive or negitive
# accuracy() will test it on the predictions made and tell accuracy

x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(y_train, x_train_prediction)

In [None]:
print('accuracy score on training data : ',training_data_accuracy)

In [None]:
# accuracy score on test data

x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(y_test, x_test_prediction)

In [None]:
print('accuracy score on test data : ',test_data_accuracy)

***Training Data Score : 81%***

***Testing  Data Score : 77%***