In [23]:
import pickle
import pandas as pd
import numpy as np
import spacy
import re
import stanza
import sqlite3
import time
import string

In [7]:
stanza.download('it')
it_nlp_stanza = stanza.Pipeline('it', processors='tokenize,mwt,pos,lemma', verbose=False, use_gpu=False)

try:
    it_nlp_spacy = spacy.load('it_core_news_lg')
except OSError:
    download('it_core_news_lg')
    it_nlp_spacy = spacy.load('it_core_news_lg')



In [8]:
conn = sqlite3.connect('/g100_work/IscrC_mental/data/database/MENTALISM.db')

In [18]:
from spacy.cli import download
from spacy.lang.it.stop_words import STOP_WORDS

In [43]:
def download_batch(batch_size):
    start_time = time.time()
    your_query = f"SELECT tweet_id, text, language FROM tweets limit {batch_size}"
    df = pd.read_sql_query(your_query, conn)
    elapsed_time = time.time() - start_time
    print(f"\nTime taken to download batch of size {batch_size}: {elapsed_time:.2f} seconds")
    return df


def itallian_df(df):
    start_time = time.time()
    itallian_df = df[df['language'] == 'it']
    print(f"\nTime taken for 'itallian_df': {time.time() - start_time:.2f} seconds")
    print(f"\nLength of downloaded df {len(df)}")
    return itallian_df

def cleaning_df(df):
    start_time = time.time()
    def cleaning(text):
        
        stop_words = STOP_WORDS
        stop_words_to_remove = ['anni', 'anno']
        stop_words = [word for word in stop_words if word not in stop_words_to_remove]
        if pd.isna(text):
                return ""
        words_new = [word[1:].translate(str.maketrans('', '', string.punctuation)) 
        if (word.startswith('#') or word.startswith('@')) else word.lower().translate(str.maketrans('', '', string.punctuation))
        for word in text.split()
        if not (re.match(r"http\S+|www\S+|https\S+", word) or word.lower() in stop_words)]
        filtered_text = ' '.join(words_new)
        return filtered_text

    df['text'] = df['text'].apply(cleaning)
    print(f"\nTime taken for 'cleaning_df': {time.time() - start_time:.2f} seconds")
    return df


def lemmatizationSpacy_df(df, nlp_model=it_nlp_spacy):
    start_time = time.time()
    def lemmatizationSpacy(text):
        tokens = nlp_model(text)
        lemmatized_tweet = " ".join([token.lemma_ for token in tokens])
        return lemmatized_tweet
    df['text'] = df['text'].apply(lemmatizationSpacy)
    print(f"\nTime taken for 'spacy': {time.time() - start_time:.2f} seconds")
    return df


def lemmatizationStanza_df(df):
    start_time = time.time()
    def lemmatizationStanza(text):
        doc = it_nlp_stanza(text)
        lemmatized_text = ' '.join([word.lemma if word.lemma.endswith((',', '.')) else word.lemma + ' ' for sent in doc.sentences for word in sent.words])
        return lemmatized_text
    df['text'] = df['text'].apply(lemmatizationStanza)
    print(f"\nTime taken for 'Stanza': {time.time() - start_time:.2f} seconds")
    return(df)


def save_to_csv(df, output_file='output_file.csv'):
    start_time = time.time()
    df.to_csv(output_file, index=False)
    print(f"\nDataFrame saved to {output_file}")
    print(f"\nTime taken for 'saving': {time.time() - start_time:.2f} seconds")


In [72]:
batch = download_batch(10000)


Time taken to download batch of size 10000: 0.48 seconds


In [73]:
ital = itallian_df(batch)


Time taken for 'itallian_df': 0.23 seconds

Length of downloaded df 10000


In [76]:
ital = df.drop('language', axis =1) 

In [77]:
ital

Unnamed: 0,tweet_id,text
8,1002833,preparare accomparire mamma regalo natale
39,2002923,cercare capire funzionare
65,2054033,colpa beggio
67,2057643,cercare capire funzionare twitter
91,2082733,tagliare capello
...,...,...
9987,173390792,motokrzr
9989,173443782,rosso spellare peperona
9991,173610562,luna
9996,174264072,buondì twitters


In [52]:
    def cleaning(text):
        
        stop_words = STOP_WORDS
        stop_words_to_remove = ['anni', 'anno']
        stop_words = [word for word in stop_words if word not in stop_words_to_remove]
        if pd.isna(text):
                return ""
        words_new = [word[1:].translate(str.maketrans('', '', string.punctuation)) 
        if (word.startswith('#') or word.startswith('@')) else word.lower().translate(str.maketrans('', '', string.punctuation))
        for word in text.split()
        if not (re.match(r"http\S+|www\S+|https\S+", word) or word.lower() in stop_words)]
        filtered_text = ' '.join(words_new)
        return filtered_text

In [63]:
print(ital['text'].iloc[0])

mi preparo e poi accompagno mamma a fare regali di natale


In [64]:
cleaning(ital['text'].iloc[0])

'preparo accompagno mamma regali natale'

In [65]:
clean = cleaning_df(ital)


Time taken for 'cleaning_df': 0.79 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(cleaning)


In [69]:
ital

Unnamed: 0,tweet_id,text,language
8,1002833,mi preparo e poi accompagno mamma a fare regal...,it
39,2002923,sto cercando di capire come funziona!!!!!!!,it
65,2054033,Tutta colpa di Beggi!!!,it
67,2057643,cerco di capire come funziona questo twitter\n,it
91,2082733,Ho tagliato i capelli (tutti),it
...,...,...,...
9987,173390792,@tutti qualcuno possiede un motokrzr?,it
9989,173443782,"è rossa spellata, è come un peperone.",it
9991,173610562,"È quasi pieno, come la luna",it
9996,174264072,BuonDì Twitters,it


In [70]:
clean

Unnamed: 0,tweet_id,text,language
8,1002833,preparo accompagno mamma regali natale,it
39,2002923,cercando capire funziona,it
65,2054033,colpa beggi,it
67,2057643,cerco capire funziona twitter,it
91,2082733,tagliato i capelli tutti,it
...,...,...,...
9987,173390792,tutti possiede motokrzr,it
9989,173443782,rossa spellata peperone,it
9991,173610562,pieno luna,it
9996,174264072,buondì twitters,it


In [49]:
ital['text'] == clean['text']

8       True
39      True
65      True
67      True
91      True
        ... 
9987    True
9989    True
9991    True
9996    True
9999    True
Name: text, Length: 5766, dtype: bool

In [71]:
df

Unnamed: 0,tweet_id,text,language
8,1002833,preparare accomparire mamma regalo natale,it
39,2002923,cercare capire funzionare,it
65,2054033,colpa beggio,it
67,2057643,cercare capire funzionare twitter,it
91,2082733,tagliare capello,it
...,...,...,...
9987,173390792,motokrzr,it
9989,173443782,rosso spellare peperona,it
9991,173610562,luna,it
9996,174264072,buondì twitters,it


In [25]:
df = lemmatizationStanza_df(df)


Time taken for 'Stanza': 701.40 seconds


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text'] = df['text'].apply(lemmatizationStanza)


In [30]:
df['text'] == dff['text']

8       True
39      True
65      True
67      True
91      True
        ... 
9987    True
9989    True
9991    True
9996    True
9999    True
Name: text, Length: 5766, dtype: bool

In [2]:
with open('/g100_work/IscrC_mental/data/user_classification/user_age_gender_location_test_set.pkl', 'rb') as file:
    data_test = pickle.load(file)

In [5]:
tweets = list(data_test['tweet'])

In [6]:
tweets = [str(tw) for tw in tweets]

In [None]:
### spacy 

In [6]:
nlp = spacy.load('it_core_news_lg')

In [8]:
### for future how to display whole tweets 
pd.set_option('display.max_colwidth', None)

In [9]:
components_to_remove = ['tagger', 'parser', 'attribute_ruler', 'ner']
for component in components_to_remove:
    nlp.remove_pipe(component)

In [10]:
from spacy.language import Language

@Language.component("cleaning")
def cleaning(doc):
    stop_words = spacy.lang.it.stop_words.STOP_WORDS
    new_tokens = [token.text for token in doc if not re.match(r"http\S+|www\S+|https\S+|@\S+|#(?!\w)", token.text) and token.text.lower() not in stop_words]
    return spacy.tokens.Doc(doc.vocab, words=new_tokens)


In [11]:
nlp.add_pipe("cleaning", name="cleaning", first=True)
print(nlp.pipe_names)

['cleaning', 'tok2vec', 'morphologizer', 'lemmatizer']


In [None]:
### trying with different modes - doesn't work

nlp.remove_pipe('lemmatizer')

config = {"mode": "rule", "overwrite": True}
lemmatizer = nlp.add_pipe("lemmatizer", config=config)

doc = nlp(tweets[0])
for token in doc:
    print(f'pos: {token.pos_}, text: {token.text}, lemma: {token.lemma_}')

In [47]:
lemmatized_tweets_spacy = []
for tweet in tweets:
    tokens = nlp(tweet)
    lemmatized_tweet = " ".join([token.lemma_ for token in tokens])
    lemmatized_tweets_spacy.append(lemmatized_tweet)

In [12]:
### the same but with time analysis 
import time
from tqdm.notebook import tqdm

lemmatized_tweets_spacy = []
total_start_time = time.time() 


progress_bar = tqdm(total=len(tweets), desc="Processing tweets", unit="tweet", leave=False)

for tweet in tweets:
    tokens = nlp(tweet)
    lemmatized_tweet = " ".join([token.lemma_ for token in tokens])
    lemmatized_tweets_spacy.append(lemmatized_tweet)
    

    progress_bar.update(1)


progress_bar.close()

total_end_time = time.time() 
total_time = total_end_time - total_start_time
average_time = total_time / len(tweets) if len(tweets) > 0 else 0

print(f"\nTotal time for all iterations: {total_time:.2f} seconds")
print(f"Average time per item: {average_time:.4f} seconds")


Processing tweets:   0%|          | 0/1119 [00:00<?, ?tweet/s]


Total time for all iterations: 4.96 seconds
Average time per item: 0.0044 seconds


In [None]:
#### trying stanza!!!

In [13]:
import stanza

In [14]:
stanza.download('it')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-13 15:32:08 INFO: Downloading default packages for language: it (Italian) ...
2023-11-13 15:32:09 INFO: File exists: /g100/home/userexternal/ddurmush/stanza_resources/it/default.zip
2023-11-13 15:32:13 INFO: Finished downloading models and saved to /g100/home/userexternal/ddurmush/stanza_resources.


In [15]:
it_nlp = stanza.Pipeline('it', processors='tokenize,mwt,pos,lemma', verbose=False, use_gpu=False)

In [16]:
### lets just clean it here 
## stanza does not have stop words for italian

tweets_no_links_mentions = [re.sub(r"(http\S+|www\S+|https\S+|@\S+)", "", tweet, flags=re.MULTILINE) for tweet in tweets]
stop_words = spacy.lang.it.stop_words.STOP_WORDS

tweets_no_links_mentions_stopwords = []

for tweet in tweets_no_links_mentions:
    filtered_tweet = ' '.join(word for word in tweet.split() if word.lower() not in stop_words)
    tweets_no_links_mentions_stopwords.append(filtered_tweet)
    
tweets_no_hashtags = [
    re.sub(r'#', '', tweet)
    for tweet in tweets_no_links_mentions_stopwords
]

In [13]:
import stanza

class RemoveTweetsMentionsStopwordsProcessor:
    def __init__(self, device, config, pipeline):
        self.stop_words = {'stopword1', 'stopword2', 'stopword3'}  # Add your stop words here

    def _set_up_model(self, *args):
        pass

    def process(self, doc):
        tweets_mentions_stopwords_removed = []

        for sent in doc.sentences:
            cleaned_tokens = []
            for tok in sent.tokens:
                # Check if the token is not a mention, hashtag, or in the stop words
                if (
                    not tok.text.startswith('@')
                    or not tok.text.startswith('#')
                    or tok.text.lower() not in self.stop_words
                ):
                    cleaned_tokens.append(tok.text)

            cleaned_sentence = ' '.join(cleaned_tokens)
            tweets_mentions_stopwords_removed.append(cleaned_sentence)

        # Update the document with the cleaned text
        doc.text = ' '.join(tweets_mentions_stopwords_removed)

        return doc

# Download the Italian models
stanza.download('it')

# Create the pipeline with the custom processor
custom_pipeline = stanza.Pipeline('it', processors={'tokenize': 'it', 'remove_tweets_mentions_stopwords': RemoveTweetsMentionsStopwordsProcessor})

# Process text
text = "This is a @mention and #hashtag example."
doc = custom_pipeline(text)

# Access the cleaned text
cleaned_text = doc.text
print("Cleaned Text:", cleaned_text)


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

2023-11-16 19:42:20 INFO: Downloading default packages for language: it (Italian) ...
2023-11-16 19:42:21 INFO: File exists: /g100/home/userexternal/ddurmush/stanza_resources/it/default.zip
2023-11-16 19:42:24 INFO: Finished downloading models and saved to /g100/home/userexternal/ddurmush/stanza_resources.
2023-11-16 19:42:24 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.6.0.json:   0%|   …

AttributeError: type object 'RemoveTweetsMentionsStopwordsProcessor' has no attribute 'strip'

In [27]:
from stanza.pipeline.processor import ProcessorVariant, register_processor_variant

In [26]:
from stanza.pipeline.processor import Processor
from stanza.pipeline.core import register_processor

@register_processor("remove_tweets_mentions")
class RemoveTweetsMentionsProcessor(Processor):
    ''' Processor that cleanes '''
    _requires = set(['tokenize'])
    _provides = set(['cleaned'])

    def __init__(self, device, config, pipeline):
        pass

    def _set_up_model(self, *args):
        pass

    def process(self, doc):
        tweets_mentions_removed = []

        for sent in doc.sentences:
            cleaned_tokens = []
            for tok in sent.tokens:
                if (not tok.text.startswith('@')
                    or not tok.text.startswith('#')
                    or not re.match(r"http\S+|www\S+|https\S+|@\S+|#(?!\w)", tok.text)
                    or tok.text.lower() not in stop_words):
                    cleaned_tokens.append(tok.text)

            cleaned_sentence = ' '.join(cleaned_tokens)
            tweets_mentions_removed.append(cleaned_sentence)

        doc.text = ' '.join(tweets_mentions_removed)
        
        return doc


ImportError: cannot import name 'register_processor' from 'stanza.pipeline.core' (/g100/home/userexternal/ddurmush/.local/lib/python3.10/site-packages/stanza/pipeline/core.py)

In [None]:
@register_processor("cleaning")
class LowercaseProcessor(Processor):
    ''' Processor that lowercases all text '''
    _requires = set(['tokenize'])
    _provides = set(['lowercase'])

    def __init__(self, device, config, pipeline):
        pass

    def _set_up_model(self, *args):
        pass

    def process(self, doc):
        doc.text = doc.text.lower()
        for sent in doc.sentences:
            for tok in sent.tokens:
                tok.text = tok.text.lower()

            for word in sent.words:
                word.text = word.text.lower()

        return doc


In [None]:
nlp = stanza.Pipeline(dir=TEST_MODELS_DIR, lang='en', processors='tokenize,lowercase')

In [None]:
#### let's run pipeline 

lemmatized_tweets_stanza = []
for tweet in tweets_no_hashtags :
    doc = it_nlp(tweet)
    lemmatized_tweet = ' '.join([word.lemma if word.lemma.endswith((',', '.')) else word.lemma + ' ' for sent in doc.sentences for word in sent.words])
    lemmatized_tweets_stanza.append(lemmatized_tweet)

In [17]:
### pipeline with tdqm because it takes a lot of time to process 

import time
from tqdm.notebook import tqdm

lemmatized_tweets_stanza = []
total_start_time = time.time() 

progress_bar = tqdm(total=len(tweets), desc="Processing tweets", unit="tweet", leave=False)

for tweet in tweets_no_hashtags:
    doc = it_nlp(tweet)
    lemmatized_tweet = ' '.join([word.lemma if word.lemma.endswith((',', '.')) else word.lemma + ' ' for sent in doc.sentences for word in sent.words])
    lemmatized_tweets_stanza.append(lemmatized_tweet)
    

    progress_bar.update(1)


progress_bar.close()

total_end_time = time.time()
total_time = total_end_time - total_start_time
average_time = total_time / len(tweets) if len(tweets) > 0 else 0

print(f"\nTotal time for all iterations: {total_time:.2f} seconds")
print(f"Average time per item: {average_time:.4f} seconds")



Processing tweets:   0%|          | 0/1119 [00:00<?, ?tweet/s]


Total time for all iterations: 167.74 seconds
Average time per item: 0.1499 seconds


In [19]:
df_to_check = pd.DataFrame({'original': tweets, 'lemmatized_spacy':lemmatized_tweets_spacy, 'lemmatized_stanza':lemmatized_tweets_stanza  })
df_to_check.to_csv('lemmas_spacy_stanza.csv')

In [20]:
df_to_check

Unnamed: 0,original,lemmatized_spacy,lemmatized_stanza
0,"@StefanoGuerrera Sono una figlia, da piccola non mi hanno insegnato a baciare sulle labbra, mai fatto. Ora ho 30 anni, qualche volta con mia madre lo facevo, per affetto, ed ora che non c’è più lo rimpiango, mai vergognata e mai avuto problemi di sessualità, se ci fosse ancora lo farei ...","figlio , piccolo insegnato baciare labbro , . 30 , madre , affetto , ci rimpiango , vergognare problema sessualità , ...","figlio , piccolo insegnare baciare labbro , fare . 30 anno , madre fare , affetto , ci essere rimpiangere , vergognare problema sessualità , ..."
1,"Per tutti i followers...... non sono il front man dei Dari, sono un informatico ed ho 42 anni, il Dario che cercate é un mio omonimo.","il followers ...... front man Dari , informatico 42 , Dario cercare é omonimo .","il followers...... front mano Dari , informatico 42 anno , Dario cercare essere omonimo ."
2,@ladyonorato Ho 51 anni e non mi è mai capitato di provare così tanto odio verso il governo!,51 capitare provare odio !,51 capitare provare odio governo !
3,@carmelitadurso ciao barbara ho ho seguito l'intervento di lemme io sono grassa ho 53 anni sono pronta a mettermi in gioco se mi seguì tu,ciao Barbara intervento lemme grasso 53 pronto mettere mi gioco seguire,ciao barbara il intervento lemme grasso 53 pronto mettere mi gioco seguire
4,"@PietroF70 Diplomato alla scuola alberghiera,ho fatto aiuto cuoco, lavapiatti,portiere notturno e istruttore di scuola guida e perché ho 42 anni a casa","diplomato scuola alberghiero , aiuto cuoco , lavapiatto , portiere notturno istruttore scuola guidare 42","diplomare scuola alberghiero , avere aiuto cuoco , lavapiatto , portiere notturno istruttore scuola guida 42"
...,...,...,...
1114,"Mi chiamo Marianna, ho 28 anni e vorrei credere nel mio voto. Ma ahimè, tristemente andrò a votare quello che fa meno schifo, forse.","chare Marianna , 28 volere credere voto . , tristemente andrò votare schifo , .","chiamare Marianna , 28 volere credere voto . ahimè , tristemente andare votare schifo , forse ."
1115,"@civati Populismo di sinistra, populismo di destraAggrappati all’unica vera figura seria, l’attuale presidente del consiglio.Non credo di aver mai letto/sentito da quando sono nato (e ho 47 anni):”facciamo come l’Italia” detto dai tedeschi. Ci vuole preparazione/cultura per fare politica","populismo sinistra , populismo destraAggrappare a il unico vero figura serio , il attuale presidente . credere letto / sentire nato ( 47 anni):”faccare il Italia ” tedesco . volere preparazione / cultura politico","populismo sinistra , populismo destro aggrappare a il unico vero figura serio , il attuale presidente consiglio . non credere letto/sentito nascere ( e 47 anno ) : ”facciare il Italia ” tedere hi . volere preparazione /cultura politico"
1116,,,
1117,@rosita17rosita ho 49anni e vorrei diventare un politico senza compenso solo per risolvere i problemi dell'Italia,49anni volere politico compenso risolvere il problema Italia,49anni volere politico compenso risolvere il problema di il Italia


In [None]:
#trying to find retweets

In [4]:
import sqlite3
conn = sqlite3.connect('/g100_work/IscrC_mental/data/database/MENTALISM.db')
your_query = "SELECT * FROM tweets limit 100000"
df = pd.read_sql_query(your_query, conn)

In [26]:
import re

df['text'] = df['text'].fillna('')
retweet_df = df[df['text'].str.match(r'^RT \w+:')]

In [27]:
retweet_df

Unnamed: 0,tweet_id,user_id,created_at,text,retweet_text,language,likes,retweets


In [92]:
dff = pd.read_csv('lemmas_spacy_stanza_copy.csv')

ParserError: Error tokenizing data. C error: Buffer overflow caught - possible malformed input file.
