In [11]:
import os
import re
import sys
import numpy as np
import pandas as pd
import string
import re
import preprocessor as p
from preprocessor.api import clean

from sklearn.model_selection import train_test_split

In [12]:
# Root of the data tree, relative to the notebook's working directory.
DATA_DIR = "../Data"

# Sub-folders of DATA_DIR.
TWEETS_PATH, TREND_PATH, SAVE_PATH, STATS_PATH = (
    os.path.join(DATA_DIR, subdir)
    for subdir in ('tweets', 'trends', 'save', 'stats')
)

# Show the first few entries of the save folder (in whatever order os.listdir returns them).
os.listdir(SAVE_PATH)[:5]

['lda_test_data',
 '2019-07-01_trends.csv',
 'oo-2019-08-30_trends.csv',
 '2019-07-02_trends.csv',
 'lda_train_data']

In [13]:
def collect_stats(dfs):
    """
    Write two per-day match-count tables to STATS_PATH.

    * tweetMatch_per_trend.txt - for each (trend_date, trend), the number of
      distinct tweet ids.
    * trendMatch_per_tweet.txt - for each (trend_date, id), the number of
      distinct trends.

    dfs must provide the columns trend_date, trend and id. Nothing is returned.
    """
    # (second grouping key, column whose distinct values are counted, output file)
    reports = (
        ("trend", "id", "tweetMatch_per_trend.txt"),
        ("id", "trend", "trendMatch_per_tweet.txt"),
    )

    for key_col, counted_col, out_name in reports:
        per_day = dfs.groupby(["trend_date", key_col])
        counts = per_day.agg({counted_col: "nunique"}).reset_index()
        # index=True also writes the fresh 0..n-1 row index as the first column.
        counts.to_csv(os.path.join(STATS_PATH, out_name), index=True)

In [14]:
def clear_text(dfs):
    """
    Normalise the 'trend' and 'text' columns of dfs in place.

    Both columns are lower-cased and stripped of ASCII digits and ASCII
    punctuation. 'text' is additionally passed through `clean` (right after
    lower-casing) and finally stripped of non-ASCII characters. Duplicate rows
    are then dropped. The frame is modified in place; nothing is returned.
    """
    digit_table = str.maketrans('', '', string.digits)
    punctuation_re = re.compile('[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]')
    non_ascii_re = re.compile(r'[^\x00-\x7F]+')

    def lowercase(value):
        return value.lower()

    def strip_digits(value):
        return value.translate(digit_table)

    def strip_punctuation(value):
        return punctuation_re.sub('', value)

    def strip_non_ascii(value):
        return non_ascii_re.sub('', value)

    # NOTE(review): lower-casing runs before `clean`; if `clean` relies on
    # case-sensitive patterns (e.g. an upper-case retweet marker) they will no
    # longer match - confirm this ordering is intended.
    pipelines = (
        ('trend', (lowercase, strip_digits, strip_punctuation)),
        ('text', (lowercase, clean, strip_digits, strip_punctuation, strip_non_ascii)),
    )

    # Apply each step to the whole column before moving on to the next one.
    for column, steps in pipelines:
        for step in steps:
            dfs[column] = dfs[column].map(step)

    dfs.drop_duplicates(inplace=True)

In [15]:
def plotting_stats(dfs):
    """
    Write per-day, per-trend count tables to STATS_PATH.

    * tweetCount_per_trend.txt  - distinct tweet ids per (trend_date, trend)
    * authorCount_per_trend.txt - distinct author ids per (trend_date, trend)

    Despite the name, no figure is drawn here; the function only writes the
    tables. dfs must provide trend_date, trend, id and author_id.
    """
    per_trend_day = dfs.groupby(["trend_date", "trend"])

    # Compute both tables before writing anything.
    tweet_counts = per_trend_day.agg({"id": "nunique"}).reset_index()
    author_counts = per_trend_day.agg({"author_id": "nunique"}).reset_index()

    outputs = (
        (tweet_counts, "tweetCount_per_trend.txt"),
        (author_counts, "authorCount_per_trend.txt"),
    )
    for table, out_name in outputs:
        table.to_csv(os.path.join(STATS_PATH, out_name), index=True)


In [16]:
def prepare_data(start=None, end=None, test_size=0.00001, random_state=None):
    """
    Build the LDA train/test sets from the per-day trend CSVs in SAVE_PATH.

    Steps: select the CSV files whose names fall in [start, end], keep the
    English rows, drop bookkeeping columns and rows with missing values,
    concatenate everything, write the statistics tables, clean the text,
    then split and save the result as 'lda_train_data' / 'lda_test_data'
    inside SAVE_PATH.

    Parameters
    ----------
    start, end : str or None
        Lower / upper bound for the file names. When omitted, the
        notebook-level variables `start` / `end` are used if they exist
        (this is how the function was originally driven); otherwise that
        side is unbounded.
        The bounds are compared against the *whole* file name as strings,
        so end="2019-09-02" does NOT select "2019-09-02_trends.csv" (the
        longer name sorts after the bare date).
    test_size : float or int
        Forwarded to train_test_split.
    random_state : int or None
        Forwarded to train_test_split; set it to make the split repeatable.

    Returns
    -------
    (dfs_train, dfs_test) : tuple of DataFrame

    Raises
    ------
    ValueError
        If no CSV file in SAVE_PATH matches the requested range.
    """
    # Backward compatibility: earlier cells set `start` / `end` as globals
    # and then called prepare_data() without arguments.
    if start is None:
        start = globals().get('start')
    if end is None:
        end = globals().get('end')

    def _in_range(name):
        after_start = start is None or name >= start
        before_end = end is None or name <= end
        return after_start and before_end

    # Sorted so the concatenation order does not depend on os.listdir().
    files = sorted(
        name for name in os.listdir(SAVE_PATH)
        if _in_range(name) and 'csv' in name
    )
    if not files:
        raise ValueError(
            "No csv files found in %s for range [%s, %s]" % (SAVE_PATH, start, end)
        )

    frames = []
    for i, file in enumerate(files):
        date = file.split('_')[0]
        print('%d / %d - %s - date: %s' % (i, len(files), file, str(date) ))

        day = pd.read_csv(os.path.join(SAVE_PATH, file), header=0, usecols=list(range(10)),
                          parse_dates=['trend_date'])
        # Chained (non in-place) calls avoid mutating a filtered slice.
        day = (
            day[day.lang == "en"]
            .drop(columns=["Unnamed: 0", "lang", "created_at", "match", "match_rule"])
            .dropna()
        )
        frames.append(day)

    dfs = pd.concat(frames)

    # collect_stats runs on the raw text; clear_text then rewrites dfs in
    # place, so plotting_stats sees the cleaned, de-duplicated frame.
    collect_stats(dfs)
    clear_text(dfs)
    plotting_stats(dfs)

    dfs_train, dfs_test = train_test_split(dfs, test_size=test_size,
                                           random_state=random_state)
    dfs_train.to_csv(os.path.join(SAVE_PATH, "lda_train_data"), index=False)
    dfs_test.to_csv(os.path.join(SAVE_PATH, "lda_test_data"), index=False)

    return dfs_train, dfs_test

In [21]:
# NOTE: `dfs_train` / `dfs_test` only exist after the three commented-out lines
# below have been executed. On a fresh kernel, uncomment them first; be aware
# that prepare_data() re-reads the matching trend files and overwrites the saved
# lda_train_data / lda_test_data.
# start ="2019-07-01"
# end = "2019-09-02"

# dfs_train, dfs_test = prepare_data()
dfs_train.head()

Unnamed: 0,author_id,id,text,trend,trend_date
126695,3306453554,1146641969067675648,rt muthafuckas in the comic shops talking bou...,zendaya,2019-07-04
92422,50921588,1146032549283729409,any nice words for the family of tyler skaggs or,tyler skaggs,2019-07-01
11717,948449480549920768,1146512050522247168,rt halle bailey has been casted as ariel to p...,halle bailey,2019-07-03
137850,702614012798885888,1146178670429650945,rt no one saw this coming trump nemesis megan...,rapinoe,2019-07-03
48995,828134743094931457,1146261214336471040,rt official mv is out now hashtag trend,lightsiscoming,2019-07-02


In [19]:
# Preview the test split; `dfs_test` comes from the (commented-out) prepare_data() call.
dfs_test.head()

Unnamed: 0,author_id,id,text,trend,trend_date
72858,1081429734280507395,1146016388634406912,rt major twist in temple vandalism case after...,chandnichowk,2019-07-02
20825,452343139,1145729338865795074,rt kevin durant after leaving the warriors an...,durant,2019-07-01
19957,30297312,1145732715251150850,rt miami is finalizing a trade to send center...,hassan,2019-07-01
53107,1097771979040407552,1146133296482377728,rt repeat after me jasprit bumrah is the bes...,bumrah,2019-07-02
68394,990817480078577664,1145630219073667073,when is kawhi supposed to announce his decision,kawhi,2019-07-01


In [None]:
# df = pd.read_csv(os.path.join(SAVE_PATH,"2019-09-01_trends.csv"), header=0, parse_dates=['trend_date'])
# df = df[df.lang == "en"]
# df.drop(["Unnamed: 0","lang","created_at","match","match_rule"], inplace=True, axis=1)

# collect_stats(dfs_train)
# clear_text(dfs_train)
# plotting_stats(dfs_train)

In [None]:
# TO CHECK WHAT PERCENTAGE OF THE TWEETS ARE MATCHED

# df_ana = pd.read_csv(os.path.join(TWEETS_PATH,"2019-09-01_tweetsevenmorebasic.csv.bz2.bz2"), header=0)
# df_ana.shape
# print(df.shape[0]/df_ana.shape[0])