In [1]:
import os
import pandas as pd
import re 
import string
import nltk #pip install nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download("punkt")

from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.tokenize import TweetTokenizer,word_tokenize

import preprocessor as p #pip install tweet-preprocessor
import contractions #pip install contractions

 
from googletrans import Translator  # pip install googletrans  (pip install --upgrade setuptools)


[nltk_data] Downloading package stopwords to /home/lucia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/lucia/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/lucia/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
#Count rows in a csv file
def get_n_rows(file_path):
    df = pd.read_csv(file_path)
    return len(df)

#Riconoscimento del linguaggio di un t4esto
def detect_language(text):
    try:
        translator = Translator()
        result = translator.detect(text)
        if result.lang !='en':
            print("not english: ", text)
        return result.lang
    except:
        return None  

In [4]:
#Merging all the csv files into one
source = './scraper_new/files/'
destination = './doc'

#Creating merged.csv with header
data = {'username': [],
        'date': [],
        'text': []}

df = pd.DataFrame(data)

file_name = '/merged.csv'
df.to_csv(destination + file_name, index=False, header=True)

#Merging all the csv files into one
for file in os.listdir(source):
    print("file: ", file)
    if file.endswith(".csv"):
        df = pd.read_csv(source + file) #leggo il csv nel dataframe
        
        #Controllo il linguaggio degli hashtag
        if file.startswith("#"): 
            df['language'] = df['text'].apply(detect_language) 
            df = df[df['language'] == 'en']
            # Rimuovo la colonna language
            df.drop(columns=['language'], inplace=True)

        #Salvo il file            
        df.to_csv(destination + '/merged.csv', mode='a', header=False, index=False)
        

file=  @CollinRugg.csv
not:  San Diego, CA 🇺🇸 https://t.co/TbLJtjLfKp
not:  Joe Biden lied.
https://t.co/4269Ii0pSQ
file=  @TheDemocrats.csv
not:  RT @Two nations. One clear, firm, and unwavering message: We stand together.

Дві країни. Один чіткий, твердий і непохитний сигнал: ми разом. https://t.co/d5VGFt8nJk
not:  Two nations. One clear, firm, and unwavering message: We stand together.

Дві країни. Один чіткий, твердий і непохитний сигнал: ми разом. https://t.co/d5VGFt8nJk
file=  @Nate_Cohn.csv


In [18]:
print("Numero righe file merged: ", get_n_rows(destination + "/merged.csv"))

Numero righe file merged:  84512


In [25]:
stop_words = set(stopwords.words('english'))

additional_stopwords = ['breaking', 'report' ,'new'] #parole usate all'inizio di articoli
stop_words.update(additional_stopwords)
#print(stop_words)

lemmatizer = nltk.stem.WordNetLemmatizer()
w_tokenizer =  TweetTokenizer()

#Funzione per la pulizia del testo
def cleaning(text):
    
    #0) Check if english
    translator = Translator()
    detected_language = translator.detect(text).lang
    if detected_language != 'en':
        print("NOT english: ", text)
        return '', '', ''
    
    
    #1) Removing more than 1 space
    text = ' '.join(text.split())
    
    #2) text in lowecase
    text = text.lower()
    
    #3) Removing URLs, emojis, mentions, hashtags, RESERVED WORDS (RT, FAV), SMILEYS, NUMBERS
    p.set_options(p.OPT.URL, p.OPT.EMOJI, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.RESERVED, p.OPT.SMILEY, p.OPT.NUMBER)
    text = p.clean(text)
    
    #4) Normalizzare parole in forma contratta: isn't -> is not
    text = ' '.join([contractions.fix(word) for word in text.split()])
    
    #5) Rimozione punteggiature 
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
    
    #5.5) Rimozione parole contenenti numeri
    pattern = r'\b\w*\d\w*\b'
    text = re.sub(pattern, '', text)
    
    #6) Rimozione caratteri singoli
    pattern = r'\b\w\b' #pattern to match single character words
    text = re.sub(pattern, '', text)
    
    #7) Rimozione stopwords
    text = " ".join([word for word in text.split() if word not in stop_words])
    
    #8) Lemmatization (trasformo le parole alla forma base: says -> say. Trasformazioni abbastanza leggere: dal plurale al singolare)
    lemming = ' '.join([(lemmatizer.lemmatize(w)) for w in word_tokenize((text))])
    
    #9) Stemming
    stemmer = PorterStemmer()
    # Apply stemming to each word and join them into a single string
    stemming = ' '.join([stemmer.stem(w) for w in word_tokenize((text))])
    
    #https://www.datacamp.com/tutorial/stemming-lemmatization-python

    
    return text, lemming, stemming


In [26]:

source_merged = './doc/merged.csv'
destination_cleaned = './doc/cleaned.csv'

new_data = {'username': [],
        'date': [],
        'text': [],
        'cleaned_text': [],
        'lemmantized_text': [],
        'stemmed_text': []}


#Pre-processing
df = pd.read_csv(source_merged)

for index, row in df.iterrows():
    if index > 900:
        break
    
    #Cleaning
    cleaned_text, lemming, stemming = cleaning(row['text'])
    
    #Saving
    if cleaned_text == '' or cleaned_text in new_data['cleaned_text']: #Skippo se il testo è uguale a quello di un altro tweet!
        continue
    new_data['username'].append(row['username'])
    new_data['date'].append(row['date'])
    new_data['text'].append(' '.join(row['text'].split())) #Elimino gli spazi multipli per leggibilità
    new_data['cleaned_text'].append(cleaned_text)
    new_data['lemmantized_text'].append(lemming)
    new_data['stemmed_text'].append(stemming)
    
    print('\n\n',index,'original text: ',row['username'], ' - ', new_data['text'][-1])
    print("trasformed text: ", cleaned_text)
    print("lemming: ", lemming)
    print("stemming : ", stemming)
    
new_df = pd.DataFrame(new_data)
new_df.to_csv(destination_cleaned, index=False, header=True)


    
print("numero righe file pulito: ", get_n_rows(destination_cleaned))
    




 0 original text:  CollinRugg  -  BREAKING: Senator Dianne Feinstein has died at the age of 90. https://t.co/PWFmowW5lu
trasformed text:  senator dianne feinstein died age
lemming:  senator dianne feinstein died age
stemming :  senat diann feinstein die age


 1 original text:  CollinRugg  -  JUST IN: Alexandria Ocasio Cortez has posted on X about how upset she is that Elon Musk visited the southern border to examine the migrant crisis. Why is AOC so obsessed with Elon? Instead of working to fix the migrant crisis, AOC whined about Musk taking a “joyride” with Republican Rep. Tony Gonzalez. “The House is holding important votes in DC tonight, people are scrambling to avoid a shutdown, but this Republican Congressman decided to skip town to joyride with a billionaire when his own party has just a single-digit margin and needs his vote.” Imagine if she cared this much about securing the border!
trasformed text:  alexandria ocasio cortez posted upset elon musk visited southern border ex

IndexError: list index out of range