In [1]:
# Import necessary libraries
import pandas as pd
import re
from nltk.corpus import stopwords
from better_profanity import profanity
from textblob import TextBlob

# Load the raw tweet data. The module-level name `tweet` holds the CSV path.
tweet = 'Political_tweets_data1.csv'
df = pd.read_csv(tweet)

# Keep an untouched copy of the text; 'Tweet' itself is overwritten with the
# cleaned text further down.
df['Original_Tweet'] = df['Tweet']

# Move 'Date' into a new 'DateTime' column (appended at the end of the frame)
# and drop everything from the first '+' onwards, i.e. the UTC-offset suffix,
# so the fixed format string below can parse the value.
df['DateTime'] = df.pop('Date').astype(str).str.split('+', n=1).str[0]

# Parse to datetime. Values that do not match the format become NaT and are
# then replaced by the sentinel 1900-01-01 so the .dt accessors below never
# see missing values.
df['DateTime'] = pd.to_datetime(
    df['DateTime'], errors='coerce', format='%Y-%m-%d %H:%M:%S'
).fillna(pd.Timestamp('1900-01-01'))

# Extract date-related information with the vectorised .dt accessor instead
# of a Python-level lambda per row.
df['date'] = df['DateTime'].dt.date
df['month'] = df['DateTime'].dt.month
df['year'] = df['DateTime'].dt.year
df['hour'] = df['DateTime'].dt.hour

# Ensure NLTK stopwords are available: NLTK raises LookupError when the
# corpus has not been downloaded yet, so fetch it on demand and retry.
import nltk

try:
    stop_words = set(stopwords.words('english'))
except LookupError:
    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))

print("Hello33")
# Define the function to clean tweets
# Load the default profanity word list once, up front, rather than reloading
# it on every clean_tweet() call.
profanity.load_censor_words()


def clean_tweet(tweet):
    """Return a normalised, stop-word-free version of a tweet's text.

    Steps, in order: lower-case, censor profanity, strip @handles, #hashtags
    and URLs, replace every non-word character with a space, drop standalone
    single letters (keeping 'a' and 'i'), upper-case a standalone interior
    'i' to 'I', and finally remove English stop words.

    Args:
        tweet: The raw tweet text. Non-string values (for example the NaN
            that pandas produces for a missing cell) are treated as empty.

    Returns:
        The cleaned text with tokens separated by single spaces; '' when
        the input is not a string or nothing survives the cleaning.
    """
    # Missing values arrive as float NaN and have no .lower(); treat as empty.
    if not isinstance(tweet, str):
        return ''

    # Convert to lower case
    tweet = tweet.lower()

    # Censor profanity (word list is loaded once, above the function)
    tweet = profanity.censor(tweet)

    # Remove Twitter handles
    tweet = re.sub(r'@[^\s]+', '', tweet)

    # Remove hashtags
    tweet = re.sub(r'\B#\S+', '', tweet)

    # Remove URLs
    tweet = re.sub(r"http\S+", "", tweet)

    # Replace non-word characters (punctuation, symbols, any whitespace)
    # with a plain space
    tweet = re.sub(r'\W', ' ', tweet)

    # Remove standalone single letters except 'a' and 'i'. Word boundaries
    # are used instead of consuming the surrounding whitespace so that
    # consecutive, leading and trailing single letters are removed as well.
    tweet = re.sub(r'\b[b-hj-z]\b', ' ', tweet)

    # Upper-case a standalone 'i' that has whitespace on both sides. The
    # stop-word set is lower-case, so the resulting 'I' is kept below.
    tweet = re.sub(r'\s+i\s+', ' I ', tweet)

    # Remove stop words; split() also collapses runs of spaces and trims
    # the ends, so no separate whitespace-normalisation pass is needed.
    return ' '.join(word for word in tweet.split() if word not in stop_words)

print("Hello72")

# Number of leading rows to clean. Kept small to reduce processing time
# during testing; set to None to clean every tweet.
SAMPLE_SIZE = 100

# Clean the tweets. Missing or non-string values are normalised to strings
# first so that clean_tweet() never receives a float NaN.
tweets_to_clean = df['Tweet'] if SAMPLE_SIZE is None else df['Tweet'].head(SAMPLE_SIZE)
df['Tweet'] = tweets_to_clean.fillna('').astype(str).apply(clean_tweet)

print("Hello77")

# Rows outside the sample come back as NaN from index alignment; replace them
# with an empty string so TextBlob always receives a str.
# NOTE: those rows therefore get polarity 0.0 and the label 'Neutral' even
# though their text was never analysed.
df['Tweet'] = df['Tweet'].fillna('')

# Build one TextBlob per (cleaned) tweet
sentiment_objects = [TextBlob(text) for text in df['Tweet']]

print("Hello80")

# Extract polarity values and map them to labels:
# > 0 -> 'Positive', < 0 -> 'Negative', exactly 0 -> 'Neutral'
df['Polarity'] = [blob.sentiment.polarity for blob in sentiment_objects]
df['Sentiment'] = df['Polarity'].apply(
    lambda x: 'Positive' if x > 0 else ('Negative' if x < 0 else 'Neutral')
)

print("Hello84")
# Display the processed DataFrame
df.head(4)


Hello33
Hello72
Hello77
Hello80
Hello84
  Unnamed: 0          User                                              Tweet  \
0          0   AnandPatni8  respected indian citizens namaskaar I original...   
1          1      dhinamum  respected indian citizens namaskaar I original...   
2          2  PrincetonCGI  1 meet filmmaker prakash jha new jersey talkin...   
3          3  RishiJoeSanu  would politicians stop using religion politics...   

   Likes  Retweets                                     Original_Tweet  \
0    0.0       0.0  @vinodkapri @RahulGandhi Respected Indian Citi...   
1    0.0       0.0  *Respected Indian Citizens,* Namaskaar I Am Th...   
2    0.0       0.0  1/n-Meet Filmmaker Prakash Jha in New Jersey t...   
3    0.0       0.0  @MrinalWahal Why would politicians stop using ...   

             DateTime        date  month  year  hour  Polarity Sentiment  
0 2023-03-29 15:42:36  2023-03-29      3  2023    15 -0.062500  Negative  
1 2023-03-29 15:42:05  2023-03-29     