In [11]:
import os
import re
import sys
import numpy as np
import pandas as pd
import string
import re
import preprocessor as p
from preprocessor.api import clean

from sklearn.model_selection import train_test_split
# NLTK Stop words
from nltk.corpus import stopwords
# English NLTK stop words plus a few extra tokens appended to the list.
stop_words = stopwords.words('english') + ['from', 'subject', 're', 'edu', 'use', 'rt']

In [12]:
# All data folders are sub-directories of DATA_DIR.
DATA_DIR = "../Data"
TWEETS_PATH, TREND_PATH, SAVE_PATH, STATS_PATH = (
    os.path.join(DATA_DIR, folder)
    for folder in ('tweets', 'trends', 'save', 'stats')
)
# Show the first few entries of the save folder.
os.listdir(SAVE_PATH)[:5]

['lda_test_data',
 '2019-07-01_trends.csv',
 'oo-2019-08-30_trends.csv',
 '2019-07-02_trends.csv',
 'lda_train_data']

In [13]:
def collect_stats(dfs):
    """
    Write two per-day match tables into STATS_PATH.

    tweetMatch_per_trend.txt : number of distinct tweet ids for each
                               (trend_date, trend) pair
    trendMatch_per_tweet.txt : number of distinct trends for each
                               (trend_date, id) pair
    """
    # (output file, second grouping column, column whose distinct values are counted)
    reports = (
        ("tweetMatch_per_trend.txt", "trend", "id"),
        ("trendMatch_per_tweet.txt", "id", "trend"),
    )
    for filename, group_col, counted_col in reports:
        counts = (
            dfs.groupby(["trend_date", group_col])[counted_col]
            .nunique()
            .reset_index()
        )
        counts.to_csv(os.path.join(STATS_PATH, filename), index=True)

In [4]:
def clear_text(dfs):
    """
    Normalise the 'trend' and 'text' columns of ``dfs`` in place.

    trend : lower-cased, then ASCII digits and punctuation removed.
    text  : lower-cased, passed through ``clean`` (imported from
            ``preprocessor.api``), then ASCII digits and punctuation removed,
            and finally every non-ASCII character removed.

    Afterwards fully duplicated rows are dropped, also in place.

    Parameters
    ----------
    dfs : pandas.DataFrame
        Must contain string-valued 'trend' and 'text' columns; a non-string
        value (e.g. NaN) raises AttributeError on ``.lower()``.

    Returns
    -------
    None
        The frame passed in is modified.
    """
    # One translation table deletes digits and *every* character of
    # string.punctuation.  The previous hand-written regex class used the
    # backslash to escape ']' and therefore never removed a literal backslash
    # (and triggered a "possible nested set" FutureWarning for the bare '[').
    strip_table = str.maketrans('', '', string.digits + string.punctuation)
    non_ascii = re.compile(r'[^\x00-\x7F]+')

    def _normalise_trend(value):
        # Lower-case, then delete digits and punctuation.
        return value.lower().translate(strip_table)

    def _normalise_text(value):
        # Same order as before: lower -> clean -> digits/punctuation -> non-ASCII.
        return non_ascii.sub('', clean(value.lower()).translate(strip_table))

    dfs['trend'] = dfs['trend'].map(_normalise_trend)
    dfs['text'] = dfs['text'].map(_normalise_text)

    dfs.drop_duplicates(inplace=True)

In [5]:
def plotting_stats(dfs):
    """
    Write the per-day, per-trend counts to STATS_PATH (no figure is drawn here).

    tweetCount_per_trend.txt  : distinct tweet ids per (trend_date, trend)
    authorCount_per_trend.txt : distinct author ids per (trend_date, trend)
    """
    per_day_trend = dfs.groupby(["trend_date", "trend"])

    # Both tables are computed before anything is written to disk.
    tweet_counts = per_day_trend["id"].nunique().reset_index()
    author_counts = per_day_trend["author_id"].nunique().reset_index()

    outputs = (
        (tweet_counts, "tweetCount_per_trend.txt"),
        (author_counts, "authorCount_per_trend.txt"),
    )
    for table, filename in outputs:
        table.to_csv(os.path.join(STATS_PATH, filename), index=True)


In [6]:
def prepare_data(start=None, end=None, test_size=0.00001, random_state=None):
    """
    Build and save the LDA train/test frames from the daily CSVs in SAVE_PATH.

    Every entry of SAVE_PATH whose name contains 'csv' and whose date prefix
    (the part of the name before the first '_') lies in [start, end] is read,
    restricted to rows with lang == "en", stripped of the columns
    "Unnamed: 0", "lang", "created_at", "match", "match_rule" and of rows
    with missing values.  The per-file frames are concatenated, passed to
    collect_stats, clear_text (which edits the frame in place) and
    plotting_stats, then split and written to SAVE_PATH as 'lda_train_data'
    and 'lda_test_data'.

    Parameters
    ----------
    start, end : str, optional
        Inclusive bounds, compared as strings with the file's date prefix
        (e.g. "2019-07-01").  When omitted, the module-level variables
        ``start`` / ``end`` are used, as the notebook did before.
    test_size : float or int, default 0.00001
        Forwarded to train_test_split.
    random_state : int, optional
        Forwarded to train_test_split; pass a value for a reproducible split.

    Returns
    -------
    (dfs_train, dfs_test) : tuple of pandas.DataFrame

    Raises
    ------
    NameError
        If a bound is neither passed nor defined at module level.
    ValueError
        If no file in SAVE_PATH falls inside the requested window.
    """
    # Backward compatibility: existing cells set `start`/`end` as globals and
    # call prepare_data() without arguments.
    if start is None:
        start = globals().get('start')
    if end is None:
        end = globals().get('end')
    if start is None or end is None:
        raise NameError("prepare_data() needs 'start' and 'end' (arguments or module-level variables)")

    # Compare the date prefix, not the whole file name: a name such as
    # '2019-09-02_trends.csv' sorts *after* the bare string '2019-09-02', so
    # comparing full names silently excluded the file of the end day itself.
    # sorted() makes the processing order independent of os.listdir().
    files = sorted(
        file for file in os.listdir(SAVE_PATH)
        if 'csv' in file and start <= file.split('_')[0] <= end
    )
    if not files:
        raise ValueError("no csv files between %s and %s in %s" % (start, end, SAVE_PATH))

    frames = []
    for i, file in enumerate(files):
        date = file.split('_')[0]
        print('%d / %d - %s - date: %s' % (i, len(files), file, str(date) ))

        df = pd.read_csv(os.path.join(SAVE_PATH, file), header=0, usecols=list(range(10)),
                         parse_dates=['trend_date'])
        # Chained, non-inplace operations avoid mutating a filtered slice.
        df = (
            df[df.lang == "en"]
            .drop(columns=["Unnamed: 0", "lang", "created_at", "match", "match_rule"])
            .dropna()
        )
        frames.append(df)

    dfs = pd.concat(frames)

    # Statistics on the raw matches are collected before the text is cleaned;
    # plotting_stats runs on the cleaned, de-duplicated frame.
    collect_stats(dfs)
    clear_text(dfs)
    plotting_stats(dfs)

    dfs_train, dfs_test = train_test_split(dfs, test_size=test_size, random_state=random_state)
    dfs_train.to_csv(os.path.join(SAVE_PATH, "lda_train_data"), index=False)
    dfs_test.to_csv(os.path.join(SAVE_PATH, "lda_test_data"), index=False)

    return dfs_train, dfs_test

In [9]:
# Date window (ISO date strings); prepare_data() reads these two module-level names.
start, end = "2019-07-01", "2019-09-02"

dfs_train, dfs_test = prepare_data()
dfs_train.head()

0 / 3 - 2019-07-01_trends.csv - date: 2019-07-01
1 / 3 - 2019-07-02_trends.csv - date: 2019-07-02
2 / 3 - 2019-07-03_trends.csv - date: 2019-07-03


Unnamed: 0,author_id,id,text,trend,trend_date
101445,556419576,1146486272317362187,rt exclusive amber confronts michael about jo...,amber,2019-07-02
121693,1123717960252702725,1146174413240512515,rt loool tommy and mollymae being the only co...,tommy and molly,2019-07-03
210914,788086415456399361,1145891796834226176,rt looks like michael was the one to hit some...,michael,2019-07-02
111908,227442311,1146516530026504192,rt anna my riderrrrrerr whats annas instagram...,anna,2019-07-04
110185,3861291501,1146517381461753859,rt nobody instagram,instagramdown,2019-07-04


In [10]:
# Show the first rows of the test split returned by prepare_data().
dfs_test.head()

Unnamed: 0,author_id,id,text,trend,trend_date
699,1885640413,1145664201295163392,rt info do is eected to be released from the...,thatsokayitskyungsoo,2019-07-01
18196,1099108032,1145972126127730688,rt hows michael gonna keep slagging off amber...,joanna,2019-07-02
92985,3179560841,1146563896297381888,rt danny to curtis if amy had as much class a...,curtis,2019-07-02
109704,1159992264,1146469205677682688,rt retweet if you love your country amp ill d...,independeceday,2019-07-04
158119,958778077567442945,1146659010541707264,rt in conclusion whatsapp and instagram are t...,whatsapp,2019-07-04


In [None]:
# df = pd.read_csv(os.path.join(SAVE_PATH,"2019-09-01_trends.csv"), header=0, parse_dates=['trend_date'])
# df = df[df.lang == "en"]
# df.drop(["Unnamed: 0","lang","created_at","match ","match rule"], inplace=True, axis=1)

# collect_stats(dfs_train)
# clear_text(dfs_train)
# plotting_stats(dfs_train)

In [None]:
# To check what percentage of the tweets are matched

# df_ana = pd.read_csv(os.path.join(TWEETS_PATH,"2019-09-01_tweetsevenmorebasic.csv.bz2.bz2"), header=0)
# df_ana.shape
# print(df.shape[0]/df_ana.shape[0])