# Text preprocessing

Module of the project about data preparation, more in particular about text analysis.

### Import libraries

In [50]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import emoji
import re
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder

### Import the dataset

In [51]:
data = pd.read_csv('../../datasets/tweets&users-data.csv')
data.head(5)

Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs,user-image,user-bio,user-website,user-tweets,user-following,user-followers,user-likes,user-media
0,These Black Pitmasters Are Hustling To Preserv...,@FoodsAreGoodYes,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OIEcaYAAOQEF....,[],[],https://pbs.twimg.com/profile_images/170867034...,I love foods and definitely love writing the f...,https://foodsaregood.com,1029.0,1.0,6.0,1.0,861.0
1,#ai #food #foodporn #foodlover,@yummy_food_ai_,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0ODvvWkAARMm-....,[],[],https://pbs.twimg.com/profile_images/173305540...,AI generated images,https://instagram.com/yummy_food_ai_?igshid=Mz...,16.0,184.0,33.0,628.0,16.0
2,Chicken Licken Menu View South african menu; ...,@M14548Mehsud,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OG2IbIAAVBBw....,[],[],https://pbs.twimg.com/profile_images/170954226...,🌟 Join us as we take your taste buds on a glob...,http://chicken-licken-menu.info,1151.0,10.0,1.0,102.0,615.0
3,Pepperoni Pizza 🍕 with Peppers homecookingvsfa...,@homevsfastfood,345,4,63,1,True,,['https://pbs.twimg.com/media/GAx6QMZXIAEGXyx....,[],[],https://pbs.twimg.com/profile_images/129259871...,Great food and Recipes #homecookingvsfastfood ...,http://homecookingvsfastfood.com,16136.0,550.0,27938.0,11591.0,15625.0
4,#Fine #Dining #Dinner #flashback #Focused #Che...,@ChefLondie,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OBqXXkAA7Vpe....,[],[],https://pbs.twimg.com/profile_images/157618315...,Let Us Turn Your Ordinary Event Into An Extrao...,,4790.0,4.0,118.0,14563.0,2233.0


Function to replace eventual emojis

In [52]:
def replace_emojis(text):
    return emoji.demojize(text)

data['processed text']= data['text'].apply(replace_emojis)
data.head(5)

Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs,user-image,user-bio,user-website,user-tweets,user-following,user-followers,user-likes,user-media,processed text
0,These Black Pitmasters Are Hustling To Preserv...,@FoodsAreGoodYes,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OIEcaYAAOQEF....,[],[],https://pbs.twimg.com/profile_images/170867034...,I love foods and definitely love writing the f...,https://foodsaregood.com,1029.0,1.0,6.0,1.0,861.0,These Black Pitmasters Are Hustling To Preserv...
1,#ai #food #foodporn #foodlover,@yummy_food_ai_,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0ODvvWkAARMm-....,[],[],https://pbs.twimg.com/profile_images/173305540...,AI generated images,https://instagram.com/yummy_food_ai_?igshid=Mz...,16.0,184.0,33.0,628.0,16.0,#ai #food #foodporn #foodlover
2,Chicken Licken Menu View South african menu; ...,@M14548Mehsud,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OG2IbIAAVBBw....,[],[],https://pbs.twimg.com/profile_images/170954226...,🌟 Join us as we take your taste buds on a glob...,http://chicken-licken-menu.info,1151.0,10.0,1.0,102.0,615.0,Chicken Licken Menu View South african menu; ...
3,Pepperoni Pizza 🍕 with Peppers homecookingvsfa...,@homevsfastfood,345,4,63,1,True,,['https://pbs.twimg.com/media/GAx6QMZXIAEGXyx....,[],[],https://pbs.twimg.com/profile_images/129259871...,Great food and Recipes #homecookingvsfastfood ...,http://homecookingvsfastfood.com,16136.0,550.0,27938.0,11591.0,15625.0,Pepperoni Pizza :pizza: with Peppers homecooki...
4,#Fine #Dining #Dinner #flashback #Focused #Che...,@ChefLondie,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OBqXXkAA7Vpe....,[],[],https://pbs.twimg.com/profile_images/157618315...,Let Us Turn Your Ordinary Event Into An Extrao...,,4790.0,4.0,118.0,14563.0,2233.0,#Fine #Dining #Dinner #flashback #Focused #Che...


## Text processing

In this part of the code we proceed by erasing punctuation, stopwords, and stemming the text of the tweets.

In [53]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuations
    text = re.sub(r'\d+', '', text)     # remove digits
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [stemmer.stem(word) for word in words]
    proc_text = " ".join(words)
    return proc_text

data['processed text'] = data['processed text'].apply(clean_text)
print(data['processed text'][0:5])
data.head(5)

0    black pitmast hustl preserv barbecu root https...
1                             ai food foodporn foodlov
2    chicken licken menu view south african menu ho...
3    pepperoni pizza pizza pepper homecookingvsfast...
4    fine dine dinner flashback focus cheffinitup s...
Name: processed text, dtype: object


Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,gifs,user-image,user-bio,user-website,user-tweets,user-following,user-followers,user-likes,user-media,processed text
0,These Black Pitmasters Are Hustling To Preserv...,@FoodsAreGoodYes,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OIEcaYAAOQEF....,[],[],https://pbs.twimg.com/profile_images/170867034...,I love foods and definitely love writing the f...,https://foodsaregood.com,1029.0,1.0,6.0,1.0,861.0,black pitmast hustl preserv barbecu root https...
1,#ai #food #foodporn #foodlover,@yummy_food_ai_,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0ODvvWkAARMm-....,[],[],https://pbs.twimg.com/profile_images/173305540...,AI generated images,https://instagram.com/yummy_food_ai_?igshid=Mz...,16.0,184.0,33.0,628.0,16.0,ai food foodporn foodlov
2,Chicken Licken Menu View South african menu; ...,@M14548Mehsud,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OG2IbIAAVBBw....,[],[],https://pbs.twimg.com/profile_images/170954226...,🌟 Join us as we take your taste buds on a glob...,http://chicken-licken-menu.info,1151.0,10.0,1.0,102.0,615.0,chicken licken menu view south african menu ho...
3,Pepperoni Pizza 🍕 with Peppers homecookingvsfa...,@homevsfastfood,345,4,63,1,True,,['https://pbs.twimg.com/media/GAx6QMZXIAEGXyx....,[],[],https://pbs.twimg.com/profile_images/129259871...,Great food and Recipes #homecookingvsfastfood ...,http://homecookingvsfastfood.com,16136.0,550.0,27938.0,11591.0,15625.0,pepperoni pizza pizza pepper homecookingvsfast...
4,#Fine #Dining #Dinner #flashback #Focused #Che...,@ChefLondie,0,0,0,0,False,,['https://pbs.twimg.com/media/GA0OBqXXkAA7Vpe....,[],[],https://pbs.twimg.com/profile_images/157618315...,Let Us Turn Your Ordinary Event Into An Extrao...,,4790.0,4.0,118.0,14563.0,2233.0,fine dine dinner flashback focus cheffinitup s...


Now apply TF-IDF vectorization of the preprocessed text. We chose an approach like tf-idf as it considers the importance of a word relative to the frequence across all the documents, penalizing common words and enhancing less common and more informative words.
Another approach would have been using BOW, but it treats all words equally, disregarding of their importance and rarity.

In [54]:
# # Perform some parameter tuning
# # Create a TF-IDF vectorizer
# vectorizer = TfidfVectorizer()

# # Define a range of parameters to search (in this case, for 'max_df')
# param_grid = {'max_df': [0.7, 0.8, 0.85, 0.9]}

# # Initialize GridSearchCV
# grid_search = GridSearchCV(vectorizer, param_grid, cv=5, scoring='neg_mean_squared_error')

# # Fit GridSearchCV to your data
# grid_search.fit(data['processed text'], y)
# # y is the target variable, in this case engagement rate

# # Get the best parameter
# best_max_df = grid_search.best_params_['max_df']
# print(f"Best max_df parameter found: {best_max_df}")

# # Now, you can use this best parameter in your TfidfVectorizer
# final_vectorizer = TfidfVectorizer(max_df=best_max_df)
# final_dtm = final_vectorizer.fit_transform(data['processed text'])

# # Convert DTM to DataFrame
# dtm_df = pd.DataFrame(final_dtm.toarray(), columns=final_vectorizer.get_feature_names_out())

# # Merge the two dataframes
# data = pd.concat([data, dtm_df], axis=1)

In [55]:
# label encoding
# max_df will be set with regard of the best parameter found in the previous step
vectorizer = TfidfVectorizer(max_df=0.80) # ignore terms that appear in more than 80% of the documents
dtm = vectorizer.fit_transform(data['processed text'])
# DTM is a document-term matrix with shape (n_samples, n_features) having as rows the documents and as columns the terms learned as vocabulary
# The value of each cell is the tf-idf score

# Convert sparse matrix to dense matrix
dense_matrix = dtm.toarray()

# Display non-zero elements
non_zero_elements = dense_matrix[dense_matrix != 0]
print(len(non_zero_elements))
print(non_zero_elements)

# Convert the DTM to a dataframe
dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())

# Merge the two dataframes
data = pd.concat([data, dtm_df], axis=1)
data.fillna(0, inplace=True)
data.head(5)

25495
[0.17101553 0.22361378 0.37367624 ... 0.25818978 0.25818978 0.25818978]


Unnamed: 0,text,username,likes,comments,retweets,quotes,is-retweet,external-link,pictures,videos,...,𝓲𝓽,𝓷𝓮𝓿𝓮𝓻,𝗢𝘂𝗿,𝗣𝗶𝘇𝘇𝗮pizzapizza,𝗩𝗶𝘀𝗶𝘁,𝗪𝗲𝗯𝘀𝗶𝘁𝗲,𝗪𝗵𝗮𝘁,𝗳𝗮𝘃𝗿𝗶𝗼𝘂𝘁𝗲,𝗶𝘀,𝘆𝗼𝘂𝗿
0,These Black Pitmasters Are Hustling To Preserv...,@FoodsAreGoodYes,0,0,0,0,False,0,['https://pbs.twimg.com/media/GA0OIEcaYAAOQEF....,[],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,#ai #food #foodporn #foodlover,@yummy_food_ai_,0,0,0,0,False,0,['https://pbs.twimg.com/media/GA0ODvvWkAARMm-....,[],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Chicken Licken Menu View South african menu; ...,@M14548Mehsud,0,0,0,0,False,0,['https://pbs.twimg.com/media/GA0OG2IbIAAVBBw....,[],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Pepperoni Pizza 🍕 with Peppers homecookingvsfa...,@homevsfastfood,345,4,63,1,True,0,['https://pbs.twimg.com/media/GAx6QMZXIAEGXyx....,[],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,#Fine #Dining #Dinner #flashback #Focused #Che...,@ChefLondie,0,0,0,0,False,0,['https://pbs.twimg.com/media/GA0OBqXXkAA7Vpe....,[],...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
