# Importing necessary libraries:

In [None]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from wordcloud import WordCloud
import matplotlib.pyplot as plt


Loading the Twitter data:

In [None]:
# Read the raw comments from the CSV file in the working directory.
# The frame is bound to both `data` and `df`: the cells in this notebook use
# the two names interchangeably, and because both point at the same object,
# columns added through one name are visible through the other.
data = pd.read_csv('saf_data.csv')
df = data

Cleaning the data:

In [None]:
# Clean the 'comment' column in stages. The column is overwritten at every
# stage and ends up holding a list of tokens per row (not a string), so this
# cell is meant to be run once per freshly loaded frame.

# Missing comments would make word_tokenize raise; treat them as empty text.
df['comment'] = df['comment'].fillna('').astype(str)

# Remove URLs: anything starting with "http" or "www" up to the next whitespace.
df['comment'] = (
    df['comment']
    .str.replace(r'http\S+', '', regex=True)
    .str.replace(r'www\S+', '', regex=True)
)

# Remove special characters (everything that is neither a word character nor
# whitespace). regex=True must be explicit: since pandas 2.0 str.replace
# defaults to a literal match, which would leave the text untouched.
df['comment'] = df['comment'].str.replace(r'[^\w\s]', '', regex=True)

# Convert text to lowercase
df['comment'] = df['comment'].str.lower()

# Tokenize the text data (each row becomes a list of word tokens)
df['comment'] = df['comment'].apply(word_tokenize)

# Remove stopwords
# NOTE(review): punctuation was stripped above, so contractions such as
# "don't" arrive here as "dont" and presumably no longer match the stop list
# -- confirm whether that is acceptable.
stop_words = set(stopwords.words('english'))
df['comment'] = df['comment'].apply(lambda x: [word for word in x if word not in stop_words])

Sample of stemmed data:

In [None]:
# Reduce every token of each comment to its Porter stem and store the
# resulting token lists in a separate 'stemmed_comment' column.
stemmer = PorterStemmer()
df['stemmed_comment'] = df['comment'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

# Preview the first rows of the stemmed tokens
df['stemmed_comment'].head()

Lemmatized data:

In [None]:
# Lemmatize every token of each comment with WordNet (default part of speech)
# and store the resulting token lists in a 'lemmatized_comment' column.
lemmatizer = WordNetLemmatizer()
df['lemmatized_comment'] = df['comment'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

# Preview the first rows of the lemmatized tokens
df['lemmatized_comment'].head()
    

Using a word cloud to visualize the data:

In [None]:
# Flatten the per-row token lists into one space-separated string, which is
# the input format WordCloud.generate expects. The tokens are read from `df`,
# the frame on which the 'lemmatized_comment' column is created.
all_words = ' '.join(word for word_list in df['lemmatized_comment'] for word in word_list)

# random_state is fixed so the layout is reproducible between runs.
wordcloud = WordCloud(width=800, height=500, random_state=21, max_font_size=110).generate(all_words)

# Render the cloud as an image; axes carry no meaning here, so hide them.
plt.figure(figsize=(10, 7))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis('off')
plt.show()