In [None]:
!pip install neattext

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting neattext
  Downloading neattext-0.1.3-py3-none-any.whl (114 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m114.7/114.7 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: neattext
Successfully installed neattext-0.1.3


# Import necessary libraries

In [None]:
import datetime as dt
import html
import re
import string

import matplotlib.pyplot as plt
import neattext.functions as nfx
import numpy as np
import pandas as pd
from google.colab import drive
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from textblob import TextBlob

# Mount Google Drive


In [None]:
# Mount Google Drive under /content/drive; the dataset loaded later in this
# notebook is read from a path below that mount point.
drive.mount('/content/drive')

Mounted at /content/drive


# Define a function to handle bad lines in the CSV file

In [None]:
def bad_line(x):
  """Echo a malformed CSV row to stdout and hand nothing back to the parser.

  Used in this notebook as the ``on_bad_lines`` callback of ``pd.read_csv``
  (python engine). Per the pandas contract for that callback, a ``None``
  result means the row is skipped.

  Args:
    x: the offending row exactly as the parser passes it in.

  Returns:
    None, always.
  """
  print(x)

# Load the dataset

In [None]:
# Read the combined tweet export from the mounted Drive. The python engine is
# used so that malformed rows are routed through bad_line (printed and given a
# None result) instead of aborting the load.
csv_path = '/content/drive/Othercomputers/My Laptop (1)/year4/final_project/data_science/data_set/final_combined.csv'
df_data = pd.read_csv(csv_path, engine='python', on_bad_lines=bad_line)
print(df_data.shape)

(1529906, 22)


# Remove all rows with non-English tweets

In [None]:
# Keep only the rows whose 'Language' value is exactly 'en'.
is_english = df_data['Language'] == 'en'
# .copy() detaches the filtered rows from the frame they were selected from, so
# the new columns assigned to df_data later in the notebook are written to an
# independent DataFrame and do not trigger SettingWithCopyWarning.
df_data = df_data.loc[is_english].copy()
print(df_data.shape)

(1150394, 22)


# Define a function to clean the text

In [None]:
def clean_text(text):
  """Normalise a raw tweet into lowercase text without punctuation.

  Steps, in order: decode HTML entities, strip URLs, @mentions and
  #hashtags, delete ASCII punctuation, lowercase, collapse whitespace.

  Args:
    text: the raw tweet. Any value that is not a ``str`` (for example
      ``None`` or a float NaN standing in for a missing cell) is treated
      as an empty tweet.

  Returns:
    The cleaned string; ``''`` if nothing is left or the input was not a
    string.
  """
  # re.sub raises TypeError on non-string values, which would abort a whole
  # Series.apply over the column; map them to an empty tweet instead.
  if not isinstance(text, str):
    return ''
  # Decode entities such as "&amp;" before anything else. Otherwise the
  # punctuation pass strips only "&" and ";" and leaves a stray "amp" token,
  # and the hashtag pattern would eat the "#39" inside "&#39;".
  text = html.unescape(text)
  # Remove URLs
  text = re.sub(r'http\S+', '', text)
  # Remove mentions (@)
  text = re.sub(r'@\w+', '', text)
  # Remove hashtags (#) together with the tag word
  text = re.sub(r'#\w+', '', text)
  # Remove punctuation
  text = text.translate(str.maketrans('', '', string.punctuation))
  # Convert to lowercase
  text = text.lower()
  # Collapse runs of whitespace (including newlines) and trim both ends
  text = re.sub(r'\s+', ' ', text).strip()
  return text

# Clean the 'Tweet' column and create a new column 'clean_tweet'

In [None]:
# Build 'clean_tweet' in one expression: every raw tweet goes through
# clean_text, and that result is then passed through neattext's
# remove_multiple_spaces.
df_data['clean_tweet'] = (
    df_data['Tweet']
    .apply(clean_text)
    .apply(nfx.remove_multiple_spaces)
)

# Define a function to calculate subjectivity of a tweet

In [None]:
def get_subjectivity(tweet):
  """Return the subjectivity component of TextBlob's sentiment for a tweet.

  Args:
    tweet: text handed directly to the ``TextBlob`` constructor.

  Returns:
    The ``subjectivity`` attribute of ``TextBlob(tweet).sentiment``.
  """
  sentiment = TextBlob(tweet).sentiment
  return sentiment.subjectivity

# Define a function to calculate polarity of a tweet

In [None]:
def get_polarity(tweet):
  """Return the polarity component of TextBlob's sentiment for a tweet.

  Args:
    tweet: text handed directly to the ``TextBlob`` constructor.

  Returns:
    The ``polarity`` attribute of ``TextBlob(tweet).sentiment``.
  """
  sentiment = TextBlob(tweet).sentiment
  return sentiment.polarity
# Shape check: the recorded output shows 23 columns, one more than after the
# language filter (22), i.e. the added 'clean_tweet' column.
print(df_data.shape)

(1150394, 23)


# Create two columns for subjectivity and polarity

In [None]:
# Analyse each cleaned tweet with TextBlob exactly once and read both scores
# off the same result, rather than parsing every tweet twice (once per column).
subjectivity_scores = []
polarity_scores = []
for tweet in df_data['clean_tweet']:
  sentiment = TextBlob(tweet).sentiment
  subjectivity_scores.append(sentiment.subjectivity)
  polarity_scores.append(sentiment.polarity)

# The lists are in row order, so positional assignment lines up with df_data.
df_data['subjectivity'] = subjectivity_scores
df_data['polarity'] = polarity_scores

# Define a function to get the sentiment of a tweet based on its polarity score

In [None]:
def getSentiment(score):
  """Turn a polarity score into a sentiment label.

  Args:
    score: numeric polarity value.

  Returns:
    'negative' when ``score < 0``, 'neutral' when ``score == 0``, and
    'positive' for every other value (this includes NaN, for which both
    comparisons are false).
  """
  if score < 0:
    return 'negative'
  return 'neutral' if score == 0 else 'positive'

# Create a new column 'sentiment' to store the sentiment of each tweet

In [None]:
# Label every tweet by passing its polarity score through getSentiment.
df_data['sentiment'] = df_data['polarity'].apply(getSentiment)

In [None]:
# Preview the first two rows, including the newly added columns.
df_data.head(2)

Unnamed: 0,Tweet,Date,time,Day of week,Cashtags,Hashtags,Language,Location,Mentioned_users,Followers,...,Average_favourite_count,account_age,Likes,Comments,Retweets,Views,clean_tweet,subjectivity,polarity,sentiment
0,@RockNRoLL_85 Apples &amp; oranges. Eddie's g...,2023-03-25 23:59:58,23:59:58,Saturday,,,en,False,True,220.0,...,1.559077,11.0,0.0,0.0,0.0,9.0,apples amp oranges eddies guitar riffs are the...,0.9,0.6,positive
1,I trust Apple weather app with my life and I j...,2023-03-25 23:59:55,23:59:55,Saturday,,,en,True,False,168.0,...,0.177647,9.0,0.0,0.0,0.0,108.0,i trust apple weather app with my life and i j...,0.691667,0.05,positive


In [None]:
# Consistency check between the stored 'sentiment' labels and labels re-derived
# from 'polarity'. These are the columns this notebook creates; df_data has no
# 'sentiment_tb' / 'polarity_tb' columns.
#
# NOTE: both sides come from the same TextBlob polarity score and use the same
# thresholds as getSentiment (they agree for every non-NaN score), so all four
# metrics are 1.0 by construction. They confirm that the labelling step is
# self-consistent; they do NOT measure TextBlob's accuracy. That would require
# a column of independent ground-truth labels.
tb_true = df_data['sentiment']
tb_pred = np.where(
    df_data['polarity'] > 0, 'positive',
    np.where(df_data['polarity'] < 0, 'negative', 'neutral'),
)

tb_accuracy = accuracy_score(tb_true, tb_pred)
tb_precision = precision_score(tb_true, tb_pred, average='weighted')
tb_recall = recall_score(tb_true, tb_pred, average='weighted')
tb_f1 = f1_score(tb_true, tb_pred, average='weighted')

print("TextBlob Accuracy:", tb_accuracy)
print("TextBlob Precision:", tb_precision)
print("TextBlob Recall:", tb_recall)
print("TextBlob F1-score:", tb_f1)



TextBlob Accuracy: 1.0
TextBlob Precision: 1.0
TextBlob Recall: 1.0
TextBlob F1-score: 1.0


# Save the dataset

In [None]:
# Save the labelled DataFrame to a CSV file. The path is relative, so the file
# is written to the notebook's current working directory, not explicitly to the
# mounted Drive folder the data was read from. index=False keeps the DataFrame
# index out of the file.
# NOTE(review): on Colab the working directory presumably disappears with the
# runtime — confirm, and write under /content/drive if the file must persist.
df_data.to_csv('data_sentiment_final.csv', index=False)