<a href="https://colab.research.google.com/github/farazzashraf/Sentiment-Analysis/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import StandardScaler

In [2]:
# Mount Google Drive into the Colab VM at /content/drive; the CSV files read
# and written by later cells live under "drive/MyDrive/Sentiment Analysis/".
# NOTE(review): those later paths are relative ("drive/MyDrive/..."), so they
# presumably rely on the working directory being /content -- confirm.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Read the labelled training tweets from Drive, decoding the file as ISO-8859-1,
# and preview the first five rows.
train_data = pd.read_csv(
    filepath_or_buffer="drive/MyDrive/Sentiment Analysis/train.csv",
    encoding="ISO-8859-1",
)
train_data.head(n=5)

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [4]:
# Column dtypes and non-null counts: a first look at where values are missing.
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB


In [5]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
train_data.describe()

Unnamed: 0,Population -2020,Land Area (Km²),Density (P/Km²)
count,27481.0,27481.0,27481.0
mean,40184970.0,662173.0,357.686583
std,150494600.0,1807425.0,2013.750702
min,801.0,0.0,2.0
25%,1968001.0,22810.0,35.0
50%,8655535.0,111890.0,89.0
75%,28435940.0,527970.0,214.0
max,1439324000.0,16376870.0,26337.0


In [6]:
# List the column names of the training frame.
train_data.columns

Index(['textID', 'text', 'selected_text', 'sentiment', 'Time of Tweet',
       'Age of User', 'Country', 'Population -2020', 'Land Area (Km²)',
       'Density (P/Km²)'],
      dtype='object')

In [7]:
# Check for missing values: print a header followed by the NaN count per column.
print("\nMissing values:", train_data.isna().sum(), sep="\n")


Missing values:
textID              0
text                1
selected_text       1
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64


In [8]:
# Discard every row that has at least one missing value, modifying the frame in place.
train_data.dropna(axis=0, how="any", inplace=True)

In [9]:
# Re-count missing values per column after dropping incomplete rows.
train_data.isna().sum()

textID              0
text                0
selected_text       0
sentiment           0
Time of Tweet       0
Age of User         0
Country             0
Population -2020    0
Land Area (Km²)     0
Density (P/Km²)     0
dtype: int64

In [10]:
# Number of rows that are exact duplicates of an earlier row.
train_data.duplicated().sum()

0

In [11]:
# Check the distribution of sentiment labels: header line, then rows per class.
print("\nDistribution of sentiment labels:", train_data['sentiment'].value_counts(), sep="\n")


Distribution of sentiment labels:
neutral     11117
positive     8582
negative     7781
Name: sentiment, dtype: int64


In [12]:
# Peek at the first few tweet texts as they are before preprocessing.
train_data["text"].head()

0                  I`d have responded, if I were going
1        Sooo SAD I will miss you here in San Diego!!!
2                            my boss is bullying me...
3                       what interview! leave me alone
4     Sons of ****, why couldn`t they put them on t...
Name: text, dtype: object

In [13]:
# Fetch the NLTK resources used below: the 'punkt' tokenizer models (needed by
# word_tokenize) and the 'wordnet' corpus (needed by WordNetLemmatizer).
# The return value of the last call is what the cell displays.
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
def preprocess_data(data):
  """Return a copy of ``data`` whose 'text' column holds cleaned token lists.

  Every text is lower-cased, tokenized with ``word_tokenize``, filtered down to
  purely alphanumeric tokens that are not English stopwords, then Porter-stemmed
  and finally WordNet-lemmatized (in that order).

  Args:
    data: DataFrame with a 'text' column of raw strings.

  Returns:
    A new DataFrame with the same columns. The input frame is not modified, so
    the calling cell can be re-run without tokenizing already-tokenized text.
    Missing or non-string texts become an empty token list.
  """
  # Copy first: assigning into the caller's frame would overwrite its raw text,
  # and a second run would then be handed lists instead of strings.
  data = data.copy()

  # Only download the stopword corpus when it is not installed yet, instead of
  # contacting the NLTK server on every call.
  try:
    stop_words = set(stopwords.words('english'))
  except LookupError:
    nltk.download("stopwords")
    stop_words = set(stopwords.words('english'))

  stemmer = PorterStemmer()
  lemmatizer = WordNetLemmatizer()

  def _clean(text):
    # Missing values (NaN/None) have nothing to tokenize.
    if not isinstance(text, str):
      return []
    # Tokenize the lower-cased text and drop punctuation / special characters.
    words = [word for word in word_tokenize(text.lower()) if word.isalnum()]
    # Remove the stopwords.
    words = [word for word in words if word not in stop_words]
    # Stem first, then lemmatize the stem (the order the features were built with).
    return [lemmatizer.lemmatize(stemmer.stem(word)) for word in words]

  data['text'] = data['text'].apply(_clean)

  return data

In [15]:
# Run the text-cleaning pipeline on the training frame; the following cells
# work with the returned frame, whose 'text' column holds token lists.
preprocessed_train_data = preprocess_data(train_data)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
# Display the preprocessed data: header line, then the first cleaned token lists.
print("\nPreprocessed data: ", preprocessed_train_data['text'].head(), sep="\n")


Preprocessed data: 
0                          [respond, go]
1          [sooo, sad, miss, san, diego]
2                           [bos, bulli]
3                [interview, leav, alon]
4    [son, put, releas, alreadi, bought]
Name: text, dtype: object


In [17]:
# Preview the full preprocessed frame (all columns, first rows).
preprocessed_train_data.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"[respond, go]","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,"[sooo, sad, miss, san, diego]",Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,"[bos, bulli]",bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,"[interview, leav, alon]",leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"[son, put, releas, alreadi, bought]","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [18]:
# Features are the cleaned token lists; the target is the sentiment label.
X, y = preprocessed_train_data['text'], preprocessed_train_data["sentiment"]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [19]:
# Shapes of the four split pieces, one per line, in the order
# X_train, X_test, y_train, y_test.
print(*(piece.shape for piece in (X_train, X_test, y_train, y_test)), sep="\n")

(21984,)
(5496,)
(21984,)
(5496,)


In [20]:
# TfidfVectorizer works on strings, so glue every token list back into one
# space-separated string.
X_train_text = list(map(' '.join, X_train))
X_test_text = list(map(' '.join, X_test))

# TF-IDF over unigrams and bigrams. Terms appearing in fewer than 2 documents
# or in more than 85% of them are ignored, and the vocabulary is capped at
# 1000 features.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=2,
    max_df=0.85,
    max_features=1000,
)

# Learn the vocabulary and IDF weights from the training split only, then apply
# the same mapping to the held-out split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train_text)
X_test_tfidf = tfidf_vectorizer.transform(X_test_text)

# Scale each feature without centering (with_mean=False), which keeps the
# TF-IDF matrices sparse.
# NOTE(review): the model cell fits and predicts on the unscaled X_train_tfidf /
# X_test_tfidf, so these scaled matrices are not used anywhere visible here.
scaler = StandardScaler(with_mean=False)
X_train_tfidf_scaled = scaler.fit_transform(X_train_tfidf)
X_test_tfidf_scaled = scaler.transform(X_test_tfidf)

In [21]:
# Building the model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Logistic regression on the (unscaled) TF-IDF features; max_iter is raised to
# 1000 iterations. fit() returns the estimator itself, so it can be chained.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predicted labels for the held-out split.
y_pred = model.predict(X_test_tfidf)

In [22]:
# Evaluating the model on the held-out split: compute all three metrics first,
# then report them.
accuracy = accuracy_score(y_test, y_pred)
classification = classification_report(y_test, y_pred)
confusion = confusion_matrix(y_test, y_pred)

print("Accuracy: ", accuracy)
print("Classification Report: ", classification)
print("Confusion Matrix: ", confusion)

Accuracy:  0.6857714701601164
Classification Report:                precision    recall  f1-score   support

    negative       0.71      0.56      0.62      1572
     neutral       0.63      0.75      0.68      2236
    positive       0.77      0.72      0.74      1688

    accuracy                           0.69      5496
   macro avg       0.70      0.68      0.68      5496
weighted avg       0.69      0.69      0.68      5496

Confusion Matrix:  [[ 877  593  102]
 [ 290 1685  261]
 [  71  410 1207]]


In [23]:
# Score the trained model on the separate labelled test file, reusing the same
# cleaning function and the vectorizer fitted on the training split.
test_data = pd.read_csv("drive/MyDrive/Sentiment Analysis/test.csv", encoding="ISO-8859-1")
test_data.dropna(inplace=True)  # rows with any missing value are discarded

test_data = preprocess_data(test_data)
# Token lists -> space-separated strings, the input form the vectorizer expects.
test_data['text'] = test_data['text'].apply(' '.join)

# NOTE: X_test_tfidf and y_pred are rebound here from the held-out split to the
# test file; the cells that follow use these test-file versions.
X_test_tfidf = tfidf_vectorizer.transform(test_data['text'])
y_pred = model.predict(X_test_tfidf)

# True labels come from the test file's 'sentiment' column.
true_labels = test_data['sentiment']

accuracy = accuracy_score(true_labels, y_pred)
classification_rep = classification_report(true_labels, y_pred)
confusion_mat = confusion_matrix(true_labels, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Classification Report:", classification_rep, sep="\n")
print("Confusion Matrix:", confusion_mat, sep="\n")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Accuracy: 0.7059988681380871
Classification Report:
              precision    recall  f1-score   support

    negative       0.72      0.63      0.67      1001
     neutral       0.65      0.75      0.70      1430
    positive       0.79      0.71      0.75      1103

    accuracy                           0.71      3534
   macro avg       0.72      0.70      0.71      3534
weighted avg       0.71      0.71      0.71      3534

Confusion Matrix:
[[ 629  324   48]
 [ 195 1078  157]
 [  46  269  788]]


In [24]:
# Attach the model's predictions to the test frame as a new column.
# NOTE: at this point the 'text' column holds the cleaned, re-joined text
# produced by the previous cell, not the original tweets, so that is what
# gets exported.
test_data['predicted_sentiment'] = y_pred

# Write the frame to Drive; index=False leaves the row index out of the CSV.
test_data.to_csv("drive/MyDrive/Sentiment Analysis/predicted_test_file.csv", index=False)

In [25]:
# True labels from the test frame's 'sentiment' column.
true_sentiments = test_data['sentiment']

# Predict again on the test-file TF-IDF matrix.
y_pred = model.predict(X_test_tfidf)

# One row per text: the cleaned text, its true label and the predicted label.
results = pd.DataFrame({
    'Text': test_data['text'],
    'True Sentiment': true_sentiments,
    'Predicted Sentiment': y_pred,
})

# Split the rows by predicted class.
positive_texts = results.loc[results['Predicted Sentiment'].eq('positive')]
negative_texts = results.loc[results['Predicted Sentiment'].eq('negative')]
neutral_texts = results.loc[results['Predicted Sentiment'].eq('neutral')]

# Print a few examples from each category.
print("Examples of texts with positive predictions:", positive_texts['Text'].head(), sep="\n")
print("\nExamples of texts with neutral predictions:", neutral_texts['Text'].head(), sep="\n")
print("\nExamples of texts with negative predictions:", negative_texts['Text'].head(), sep="\n")



Examples of texts with positive predictions:
1     shanghai also realli excit precis skyscrap gal...
3                                            happi bday
5                                    great weee visitor
18       guy say hi answer question yesterday nice song
19       go spiritu stagnent explod ego realis great ok
Name: Text, dtype: object

Examples of texts with neutral predictions:
0                                last session day http
2    recess hit veroniqu branquinho quit compani shame
4                                            http like
8                          within short time last clue
9    get day alright done anyth yet leav soon steps...
Name: Text, dtype: object

Examples of texts with negative predictions:
6                                think everyon hate lol
7     soooooo wish could im school myspac complet block
12                             twitter tavern bore much
13    va weekend youngest son turn 2 tomorrow make k...
14          come socket feel like

In [26]:
def preprocess_user_input(user_input):
    """Clean one piece of free text the same way the training text was cleaned.

    The text is lower-cased and tokenized with ``word_tokenize``; tokens that
    are not purely alphanumeric or that are English stopwords are dropped, and
    the remaining tokens are Porter-stemmed and then WordNet-lemmatized.

    Args:
        user_input: Raw text as a string.

    Returns:
        The cleaned tokens joined by single spaces (an empty string when no
        token survives), ready to be passed to the fitted TF-IDF vectorizer.
    """
    # Only download the stopword corpus when it is missing; this function is
    # called once per prompt, so an unconditional download would contact the
    # NLTK server and log on every input.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download("stopwords")
        stop_words = set(stopwords.words('english'))

    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()

    tokens = [
        word
        for word in word_tokenize(user_input.lower())
        if word.isalnum() and word not in stop_words
    ]
    # Stem first, then lemmatize the stem, matching the training pipeline.
    return ' '.join(lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens)

# Interactive demo: classify whatever the user types until they enter 'exit'.
while True:
    user_input = input("Enter a text (or type 'exit' to quit): ")

    # Accept the sentinel regardless of letter case or surrounding whitespace,
    # so that e.g. "Exit " quits instead of being classified as text.
    if user_input.strip().lower() == 'exit':
        break

    # Preprocess user input
    user_input = preprocess_user_input(user_input)

    # Vectorize the user input using the same TF-IDF vectorizer
    user_input_tfidf = tfidf_vectorizer.transform([user_input])

    # Predict the sentiment of the user input (single row, so take element 0)
    sentiment = model.predict(user_input_tfidf)[0]

    print("Predicted sentiment:", sentiment)

Enter a text (or type 'exit' to quit): happy bday


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predicted sentiment: positive
Enter a text (or type 'exit' to quit): last session day


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predicted sentiment: neutral
Enter a text (or type 'exit' to quit): soo wish i am in school


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predicted sentiment: positive
Enter a text (or type 'exit' to quit): i don't like


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predicted sentiment: positive
Enter a text (or type 'exit' to quit): everyone hate me


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Predicted sentiment: negative
Enter a text (or type 'exit' to quit): exit
