# Text preprocessing

Module of the project about data preparation, more in particular about text analysis.

### Import libraries

In [13]:
import pandas as pd 
import numpy as np
import emoji
import re
from sklearn.model_selection import GridSearchCV
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer


### Import the dataset

In [14]:
data = pd.read_csv('../../datasets/data-prepared.csv')
data.info()
data.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4979 entries, 0 to 4978
Data columns (total 16 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   text                 4977 non-null   object 
 1   quotes               4979 non-null   int64  
 2   is.retweet           4979 non-null   bool   
 3   external.link        4979 non-null   bool   
 4   pictures             4979 non-null   bool   
 5   videos               4979 non-null   bool   
 6   gifs                 4979 non-null   bool   
 7   multimedial_content  4979 non-null   bool   
 8   user.image           4979 non-null   bool   
 9   user.bio             4979 non-null   bool   
 10  user.website         4979 non-null   bool   
 11  user.tweets          4979 non-null   int64  
 12  user.following       4979 non-null   int64  
 13  user.media           4979 non-null   int64  
 14  engagement.rate      4979 non-null   float64
 15  n_hashtags           4979 non-null   i

Unnamed: 0,text,quotes,is.retweet,external.link,pictures,videos,gifs,multimedial_content,user.image,user.bio,user.website,user.tweets,user.following,user.media,engagement.rate,n_hashtags
0,Fried Chicken with Hot 🔥 Sauce Corn Muffin and...,1,True,False,True,False,False,True,True,True,True,16213,549,15698,0.571817,8
1,#Morel #mushrooms at center of #food #poisonin...,0,False,False,False,False,False,False,True,True,True,50098,513,1962,0.0,10
2,My Blog:: Obama Legacy Facing REVISIT ... Ian...,0,False,False,False,False,False,False,True,True,True,5174,9022,0,0.008527,11
3,Craving sushi but tired of expensive takeout? ...,0,False,False,False,False,False,False,True,True,True,108071,678,49647,0.0,6
4,Potato Pancakes should not be a rare side dish...,0,False,False,True,False,False,True,True,True,True,28081,2343,18032,0.0,8


Function to replace eventual emojis

In [15]:
def replace_emojis(text):
    return emoji.demojize(str(text))

data['processed text']= data['text'].apply(replace_emojis)
data.head(5)

Unnamed: 0,text,quotes,is.retweet,external.link,pictures,videos,gifs,multimedial_content,user.image,user.bio,user.website,user.tweets,user.following,user.media,engagement.rate,n_hashtags,processed text
0,Fried Chicken with Hot 🔥 Sauce Corn Muffin and...,1,True,False,True,False,False,True,True,True,True,16213,549,15698,0.571817,8,Fried Chicken with Hot :fire: Sauce Corn Muffi...
1,#Morel #mushrooms at center of #food #poisonin...,0,False,False,False,False,False,False,True,True,True,50098,513,1962,0.0,10,#Morel #mushrooms at center of #food #poisonin...
2,My Blog:: Obama Legacy Facing REVISIT ... Ian...,0,False,False,False,False,False,False,True,True,True,5174,9022,0,0.008527,11,My Blog:: Obama Legacy Facing REVISIT ... Ian...
3,Craving sushi but tired of expensive takeout? ...,0,False,False,False,False,False,False,True,True,True,108071,678,49647,0.0,6,Craving sushi but tired of expensive takeout? ...
4,Potato Pancakes should not be a rare side dish...,0,False,False,True,False,False,True,True,True,True,28081,2343,18032,0.0,8,Potato Pancakes should not be a rare side dish...


## Text processing

In this part of the code we proceed by erasing punctuation, stopwords, and stemming the text of the tweets.

In [16]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)  # remove punctuations
    text = re.sub(r'\d+', '', text)     # remove digits
    words = text.split()
    words = [word for word in words if word not in stop_words]
    words = [stemmer.stem(word) for word in words]
    proc_text = " ".join(words)
    return proc_text

data['processed text'] = data['processed text'].apply(clean_text)
print(data['processed text'][0:5])
data.head(5)

0    fri chicken hot fire sauc corn muffin season f...
1    morel mushroom center food poison outbreak fun...
2    blog obama legaci face revisit ianrmackintoshb...
3    crave sushi tire expens takeout look kellylov ...
4    potato pancak rare side dish plan leftov time ...
Name: processed text, dtype: object


Unnamed: 0,text,quotes,is.retweet,external.link,pictures,videos,gifs,multimedial_content,user.image,user.bio,user.website,user.tweets,user.following,user.media,engagement.rate,n_hashtags,processed text
0,Fried Chicken with Hot 🔥 Sauce Corn Muffin and...,1,True,False,True,False,False,True,True,True,True,16213,549,15698,0.571817,8,fri chicken hot fire sauc corn muffin season f...
1,#Morel #mushrooms at center of #food #poisonin...,0,False,False,False,False,False,False,True,True,True,50098,513,1962,0.0,10,morel mushroom center food poison outbreak fun...
2,My Blog:: Obama Legacy Facing REVISIT ... Ian...,0,False,False,False,False,False,False,True,True,True,5174,9022,0,0.008527,11,blog obama legaci face revisit ianrmackintoshb...
3,Craving sushi but tired of expensive takeout? ...,0,False,False,False,False,False,False,True,True,True,108071,678,49647,0.0,6,crave sushi tire expens takeout look kellylov ...
4,Potato Pancakes should not be a rare side dish...,0,False,False,True,False,False,True,True,True,True,28081,2343,18032,0.0,8,potato pancak rare side dish plan leftov time ...


Now apply TF-IDF vectorization of the preprocessed text. We chose an approach like tf-idf as it considers the importance of a word relative to the frequence across all the documents, penalizing common words and enhancing less common and more informative words.
Another approach would have been using BOW, but it treats all words equally, disregarding of their importance and rarity.

In [5]:
# # Perform some parameter tuning
# from sklearn.linear_model import LinearRegression
# from sklearn.metrics import mean_squared_error, make_scorer
# # Create a TF-IDF vectorizer
# vectorizer = TfidfVectorizer()

# # Define a range of parameters to search (in this case, for 'max_df')
# param_grid = {
#     'max_df': [0.7, 0.8, 0.9],
#     'min_df': [0.01, 0.05, 0.1],
#     'ngram_range': [(1, 1), (1, 2), (1, 3)],
# }

# lin_regressor = LinearRegression()
# scoring = make_scorer(mean_squared_error, greater_is_better=False)

# # Initialize GridSearchCV
# grid_search = GridSearchCV(lin_regressor, param_grid, cv=5, scoring=scoring)
# dtm = grid_search.fit(data['processed text'], data['engagement.rate'])

# # Get the best parameter
# best_params = grid_search.best_params_
# print(f"Best max_df parameter found: {best_params}")

# # Now, you can use this best parameter in your TfidfVectorizer
# final_vectorizer = TfidfVectorizer(**best_params)
# final_dtm = final_vectorizer.fit_transform(data['processed text'])

# # Convert DTM to DataFrame
# dtm_df = pd.DataFrame(final_dtm.toarray(), columns=final_vectorizer.get_feature_names_out())

# # Merge the two dataframes
# data = pd.concat([data, dtm_df], axis=1)

In [17]:
# label encoding
# max_df will be set with regard of the best parameter found in the previous step
vectorizer = TfidfVectorizer(max_df=0.80) # ignore terms that appear in more than 80% of the documents
dtm = vectorizer.fit_transform(data['processed text'])
# DTM is a document-term matrix with shape (n_samples, n_features) having as rows the documents and as columns the terms learned as vocabulary
# The value of each cell is the tf-idf score

# Convert sparse matrix to dense matrix
dense_matrix = dtm.toarray()

# Display non-zero elements
non_zero_elements = dense_matrix[dense_matrix != 0]
print(len(non_zero_elements))
print(non_zero_elements)

# Convert the DTM to a dataframe
dtm_df = pd.DataFrame(dtm.toarray(), columns=vectorizer.get_feature_names_out())

# Merge the two dataframes
data = pd.concat([data, dtm_df], axis=1)
data.fillna(0, inplace=True)
data.head(5)

81925
[0.17888419 0.27322839 0.20688825 ... 0.24840209 0.86290724 0.50536233]


Unnamed: 0,text,quotes,is.retweet,external.link,pictures,videos,gifs,multimedial_content,user.image,user.bio,...,𝗪𝗲𝗲𝗸𝗲𝗻𝗱,𝗪𝗵𝗮𝘁,𝗳𝗮𝘃𝗿𝗶𝗼𝘂𝘁𝗲,𝗶𝘀,𝘆𝗼𝘂𝗿,𝘼𝙧𝙚𝙣𝙖,𝙁𝙤𝙪𝙣𝙩𝙖𝙞𝙣,𝙇𝙖𝙗𝙒𝙤𝙧𝙡𝙙,𝙔𝙤𝙪𝙩𝙝,𝙤𝙛
0,Fried Chicken with Hot 🔥 Sauce Corn Muffin and...,1,True,False,True,False,False,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,#Morel #mushrooms at center of #food #poisonin...,0,False,False,False,False,False,False,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,My Blog:: Obama Legacy Facing REVISIT ... Ian...,0,False,False,False,False,False,False,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Craving sushi but tired of expensive takeout? ...,0,False,False,False,False,False,False,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Potato Pancakes should not be a rare side dish...,0,False,False,True,False,False,True,True,True,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


: 

## Export the vectorized data into a csv

In [12]:
data.to_csv('../../datasets/data-stemmed.csv', index=False)