In this data-cleaning process for tweets we apply a filter and remove noise such as accents, URLs, and hashtags, among others. The output is a CSV file of clean data to be used in the sentiment analysis model and topic modeling.

In [None]:
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import glob
import os
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
import re, string

## At a glance

In [None]:
# Collect the paths of every raw tweets CSV file.
# `glob` is imported as a module, so the matching function is `glob.glob`;
# calling the module itself raises "TypeError: 'module' object is not callable".
raw_data_path = r'data/raw/*Tweets*.csv'
# glob returns matches in arbitrary order; sorting keeps the load order stable
csv_files = sorted(glob.glob(raw_data_path))
csv_files

## Read CSV

The initial dataset files are not included at the path mentioned above because of GitHub's file-size limits. Once you have downloaded this project to your local machine, download the CSV files from the link below and place them in "data/raw/".

https://www.kaggle.com/smid80/coronavirus-covid19-tweets-early-april

In [None]:
# Columns to load from each raw file; every other column is skipped
use_columns = ['created_at','text', 'retweet_count', 'country_code', 'followers_count', 'lang', 'screen_name']

# Fail early with an actionable message: without this guard pd.concat raises
# a generic "No objects to concatenate" when no raw file has been found.
if not csv_files:
    raise FileNotFoundError(
        "No tweet CSV files were found. Download the dataset and place the "
        "CSV files under data/raw/ before running this cell."
    )

# Read each file into its own frame (the per-file frame no longer reuses the
# name `df`, which is reserved for the combined result)
data_frames = [
    pd.read_csv(csv_path, encoding='utf-8', usecols=use_columns)
    for csv_path in csv_files
]

# ignore_index=True gives each row a unique label instead of repeating every
# file's own 0..n-1 labels in the combined frame
df = pd.concat(data_frames, ignore_index=True)

# The per-file frames are no longer needed once they are combined
del data_frames
df.head()

## Filter data tweets

In [None]:
# Keep only the tweets whose country code is Mexico ('MX').
# The explicit copy makes mx_twitts an independent frame, so adding or
# changing columns on it later does not trigger pandas' SettingWithCopyWarning.
mx_twitts = df.query("country_code == 'MX'").copy()

## Removing Noise from the Data

In [None]:
# Start from the Spanish stop-word list returned by stopwords.words()
stop_words = stopwords.words('spanish')

# Extend the list with extra words that should also be dropped from the tweets
stop_words.extend([
    'coronavirus', 'covid19', 'covid_19', 'pandemia', 'cuarentena', 'covid',
    'mexico', 'covid2019', 'covid19mx', 'cdmx', 'covidー19', 'coronaviru',
    'coronavid19', 'coronavirusmexico', 'quedateencasa', 'tiempo', 'solo',
    'aqui', 'caso', 'casa', 'pais', 'casos', 'casas', 'ahora', 'gente',
    'persona', 'mundo', 'momento', 'parte', 'dice', 'toda', 'hacer', 'hace',
    'ssalud', 'yomequedoencasa', 'mexicano', 'cosa', 'pues', 'video', 'dias',
    'puede', 'personas', 'mismo', 'tema', 'importante', 'tiempos', 'medida',
    'nuevo',
])

# Replace the accented vowels in every stop word with their plain letter,
# using one translation table instead of one substitution pass per vowel
accent_table = str.maketrans('áéíóú', 'aeiou')
stop_words = [word.translate(accent_table) for word in stop_words]

print(stop_words)

In [None]:
# Work on an independent copy of the filtered frame. Assigning columns on a
# filtered slice is what triggers pandas' SettingWithCopyWarning; copying
# removes the cause, so the warning no longer has to be silenced globally.
mx_twitts = mx_twitts.copy()

# A set keeps the stop-word membership test O(1) per token
stop_word_set = set(stop_words)

# Accented vowels, plus the dieresis "ü", mapped to their plain letter.
# Without "ü" the non-letter pattern below would split words such as
# "vergüenza" into two fragments.
accent_table = str.maketrans('áéíóúü', 'aeiouu')

# Noise patterns, applied in this order: urls, mentions, hashtags, any run of
# characters other than a-z / "ñ", and finally words of 1 to 3 characters.
# Raw strings avoid the invalid "\S" escape of a plain string literal.
noise_patterns = [
    (re.compile(r'http\S+'), ''),
    (re.compile(r'@\S+'), ' '),
    (re.compile(r'#\S+'), ' '),
    (re.compile(r'[^a-zñ]+'), ' '),
    (re.compile(r'\b\w{1,3}\b'), ' '),
]


def clean_tweet(raw_text):
    """Return the cleaned version of a single tweet.

    The text is lower-cased, accents are removed, every noise pattern is
    substituted in order, and the remaining tokens that are stop words are
    dropped. Tokens are joined back with single spaces.

    Parameters
    ----------
    raw_text : object
        The tweet text. Any value that is not a ``str`` (for example a
        missing value) is treated as an empty tweet.

    Returns
    -------
    str
        The cleaned text, or ``""`` when nothing is left.
    """
    if not isinstance(raw_text, str):
        return ""
    cleaned = raw_text.lower().translate(accent_table)
    for pattern, replacement in noise_patterns:
        cleaned = pattern.sub(replacement, cleaned)
    return ' '.join(word for word in cleaned.split() if word not in stop_word_set)


# Build "clean_text" from "text". Only the new column is normalized: the
# original "text" column (and every other column) is left exactly as loaded.
mx_twitts["clean_text"] = mx_twitts["text"].map(clean_tweet)

# Drop the tweets that are empty after cleaning. The result is reassigned
# (no in-place mutation), so re-running this cell gives the same frame.
mx_twitts = mx_twitts.loc[mx_twitts["clean_text"] != ""].copy()

mx_twitts[['text','clean_text']]

In [None]:
# Join every cleaned tweet into a single space-separated string
text = " ".join(mx_twitts["clean_text"].tolist())

# Create and generate the word cloud image from that string
wordcloud = WordCloud().generate(text)

# Draw the generated image on its own axes, with the axis decorations hidden
fig, ax = plt.subplots()
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis("off")
plt.show()

# Export clean Spanish tweets to CSV

The exported clean tweets will be used in the sentiment analysis model and topic modeling

In [None]:
## Export the cleaned tweets to CSV
# Build the path with os.path.join so the separator matches the running OS.
# A literal backslash path is only a directory path on Windows; elsewhere it
# is taken as a single file name containing backslashes.
clean_data_path = os.path.join('data', 'clean', 'mx_twitts.csv')

# Create the output folder when it is missing so the export does not fail
os.makedirs(os.path.dirname(clean_data_path), exist_ok=True)

# index=False leaves the row labels out of the file; the header row is kept
mx_twitts.to_csv(clean_data_path, index=False, header=True)