In [2]:
import pandas as pd
import os
from tqdm import trange, tqdm

tqdm.pandas(desc="Processing row")

In [4]:
DIR_TWEETS = '../model/raw/'

df = pd.DataFrame()

for file in os.listdir(DIR_TWEETS):
    print(f'Loading {file}...')
    _df = pd.read_csv(DIR_TWEETS + file, encoding='latin-1')
    df = pd.concat([df, _df])
    df.columns = _df.columns
    print(f'Loaded {file} with {len(_df)} tweets (columns: {", ".join(_df.columns)})')


Loading betsentiment-ES-tweets-sentiment-teams.csv...
Loaded betsentiment-ES-tweets-sentiment-teams.csv with 132707 tweets (columns: tweet_date_created, tweet_id, tweet_text, language, sentiment, sentiment_score)
Loading betsentiment-ES-tweets-sentiment-worldcup.csv...
Loaded betsentiment-ES-tweets-sentiment-worldcup.csv with 198460 tweets (columns: tweet_date_created, tweet_id, tweet_text, language, sentiment, sentiment_score)


In [5]:
df.shape

(331167, 6)

In [6]:
scores_names = {
    'Neutral': 'NEU',
    'Positive': 'POS',
    'Negative': 'NEG',
}

def parse_scores_to_columns(_df):

    df_scores = pd.DataFrame(columns=['NEU', 'POS', 'NEG'])

    for _, row in tqdm(_df.iterrows(), total=len(_df), desc='Processing rows'):

        try:
            score_dict = eval(row['sentiment_score'])
        except:
            score_dict = {'Neutral': 0, 'Positive': 0, 'Negative': 0, 'Mixed': 0}

        mixed = score_dict['Mixed'] / 3
        neu = (score_dict['Neutral'] + mixed) / 2
        pos = (score_dict['Positive'] + mixed) / 2
        neg = (score_dict['Negative'] + mixed) / 2

        df_scores.loc[len(df_scores)] = [neu, pos, neg] 

    return df_scores


In [7]:
df_scores = parse_scores_to_columns(df)

Processing rows: 100%|██████████| 331167/331167 [17:45<00:00, 310.85it/s]


In [8]:
# add columns df_scores to df
for column in df_scores.columns:
    df[column] = df_scores[column]

df = df[df['sentiment'] != 'MIXED']
df = df.dropna(subset=['sentiment'])

df_just_negative = df[df['sentiment'] == 'NEGATIVE']
df_just_positive = df[df['sentiment'] == 'POSITIVE']
df_just_neutral = df[df['sentiment'] == 'NEUTRAL']

df_just_negative = df_just_negative.sample(n=31086, random_state=42)
df_just_positive = df_just_positive.sample(n=31086, random_state=42)
df_just_neutral = df_just_neutral.sample(n=31086, random_state=42)

df = pd.concat([df_just_negative, df_just_positive, df_just_neutral])

df['lang'] = 'es'

df.to_csv('../model/tweets_parsed_pruned.csv', index=False)


In [3]:
_df = pd.read_csv('../model/data/parsed/tweets_parsed_pruned.csv')