In [1]:
import pandas as pd
import random
import os
import numpy as np
from datetime import datetime
import multiprocessing as mp
import time

In [2]:
DATA_DIR = "../Data"
TWEETS_PATH = os.path.join(DATA_DIR, 'tweets')
TREND_PATH = os.path.join(DATA_DIR, 'all_trends_world.csv')
SAVE_PATH = os.path.join(DATA_DIR, 'save')
os.listdir(DATA_DIR)

['2019-08-31_tweetsevenmorebasic.csv',
 'tweets',
 'save',
 'all_trends_world.csv']

In [100]:
e = "ezgi"
e.upper()

'EZGI'

In [235]:
import re

def camel_case_split(onegram):
    match_list = []
    for identifier in set(onegram):    
        matches = re.finditer('.+?(?:(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|$)', identifier)
        match_list += [m.group(0) for m in matches]
    
    return match_list
        
def onegram_augment(onegram):
    
    onegram = set(onegram)
    onegram_up = set([gram.upper() for gram in onegram])
    onegram_lower = set([gram.lower() for gram in onegram])
    
    nohash = set([re.sub('#','', gram) for gram in onegram])
    nohash_up = set([gram.upper() for gram in nohash])
    nohash_lower = set([gram.lower() for gram in nohash])
    
    camelCase = camel_case_split(nohash) 
    camelSplit = set()
    if len(camelCase) !=0 :
        cc_up = set([gram.upper() for gram in camelCase])
        cc_lower = set([gram.lower() for gram in camelCase])

        ccHashed = set(['#'+gram for gram in camelCase])
        ccHashed_up = set(['#'+gram for gram in cc_up])
        ccHashed_lower = set(['#'+gram for gram in cc_lower])
        
        ccHashJoined = set(['#'.join(camelCase)])
        ccHashJoined_up = set(['#'.join(cc_up)])
        ccHashJoined_lower = set(['#'.join(cc_lower)])
        
        
        if len(camelCase) >1:
            camelSplit = set([' '.join(camelCase)])
        
        camelCase = set().union(ccHashed, ccHashed_up, ccHashed_lower,
                        ccHashJoined_up, ccHashJoined_lower, ccHashJoined)
        
      

    
    return onegram.union(onegram_up,onegram_lower, nohash,nohash_up,nohash_lower,camelCase), camelSplit

In [236]:
#print(list(onegram)[100])
#print(list(nonegram)[:10])

#print(list(onegram_augment( [list(onegram)[100]] )))
print(list(onegram_augment( ["#EzgiYuceturk"] )))

[{'#YUCETURK', 'EZGIYUCETURK', 'YUCETURK#EZGI', '#ezgiyuceturk', '#EZGIYUCETURK', 'Ezgi#Yuceturk', 'yuceturk#ezgi', 'ezgiyuceturk', '#EzgiYuceturk', 'EzgiYuceturk', '#EZGI', '#Ezgi', '#yuceturk', '#ezgi', '#Yuceturk'}, {'Ezgi Yuceturk'}]


In [251]:
def index_trends(text, onegram_trend_set, nonegram_trend_set):
    try:
        tokens = text.split(' ')
        trend_set = set()
        
        ####### Match not only the onegram but with augmented set of it  #########
        for onegram in onegram_trend_set:
            onegram_augmented, camel_split = onegram_augment([onegram])  
            onegram_match = set(tokens).intersection(onegram_augmented) 
#             print(onegram_augmented)
#             print(tokens)
#             print(camel_split)
            if len(onegram_match)!= 0:
                trend_set.add(onegram)
            
            if len(camel_split)!=0:
                nonegram_trend_set = nonegram_trend_set.union(camel_split)
                
        ###### Make it Better ##########
        others = set([other for other in nonegram_trend_set if (" " + other + " ") in (" " + text +" ")])
        return trend_set.union(others)

    except:
        print(text)
        return set()
    

In [252]:
# onegram = set( [t for t in tr31 if len(t.split(' '))==1 ])
# nonegram = tr31 - onegram

In [266]:
#print(df.text[665])
index_trends("My life sucks Ezgi Yuceturk forzajuve", set(["#EzgiYuceturk", "#ForzaJuve"]), nonegram)

{'#ForzaJuve', 'Ezgi Yuceturk'}

In [240]:
def expand_trend_set(df, trend_col):
    
    non_list_cols = [col for col in (df.columns) if col != trend_col ]
    df2 = pd.DataFrame(df[trend_col].tolist(), index=[df[col] for col in non_list_cols])\
                    .stack()\
                    .reset_index(name=trend_col)[non_list_cols+[trend_col]]
    return df2

In [45]:
def prepare_data_trend_date_indexed_function(file, candidates):
    
    tweets_folder =  TWEETS_PATH
    save_folder = SAVE_PATH
    
    df = pd.read_csv('%s/%s' % (tweets_folder, file))
    dfs = []
    
    for candidate in candidates:
        df_that_day = pd.DataFrame(df)
        trends_that_day = set(trends[trends.date == candidate]['name'])
        
        if (len(trends_that_day) == 0):
            print('trends for %s not found!' % candidate)
            continue

        
        trends_that_day_onegrams = set([trend for trend in trends_that_day if len(trend.split(' ')) == 1])
        trends_that_day_nonegrams = trends_that_day - trends_that_day_onegrams
        
        ##################################################################################################
        df_that_day['trends'] = df_that_day.text.apply(
            lambda x: index_trends(x, trends_that_day_onegrams, trends_that_day_nonegrams))
        df_that_day = expand_trend_set(df_that_day, 'trends')
        ##################################################################################################
        
        df_that_day['trend_date'] = candidate
        dfs.append(df_that_day)
        
    dfs = pd.concat(dfs)
    new_file = file.split('_')[0] + "_trends.csv"
    dfs.to_csv('%s/%s' % (save_folder, new_file), index=False)


In [46]:
def prepare_data_trend_date_indexed_parallelized():
    
    tweets_folder =  TWEETS_PATH
    save_folder = SAVE_PATH

    files = os.listdir(tweets_folder)
    files = [file for file in files if file >= '2013-07-07' and 'csv' in file] # trends only available after this date
    pool = mp.Pool(mp.cpu_count() - 2)
    
    for i, file in enumerate(files):
        print('%d / %d - %s' % (i, len(files), file))
        date = file.split('_')[0]
        that_day = pd.Timestamp(date).date()
        one_day_before = that_day - pd.Timedelta(days = 1)
        one_day_after = that_day + pd.Timedelta(days = 1)
        candidates = [str(that_day), str(one_day_before), str(one_day_after)]
        pool.apply_async(prepare_data_trend_date_indexed_function, args=(file, candidates))

    pool.close()
    pool.join()
    

In [10]:
def trend_date_parser(d):
    format_in =  "%Y-%m-%d %X"
    format_out = "%Y-%m-%d"
 
    d = datetime.datetime.strptime(d, format_in)
    return d.strftime(format_out)

trend_date_parser("2013-07-07 23:36:32")

'2013-07-07'

In [11]:
trends = pd.read_csv(TREND_PATH, parse_dates=['date'], date_parser=trend_date_parser)
tr31 = set(trends[trends.date == '2019-08-31']['name'])
trends[trends.date == '2019-08-31']

Unnamed: 0,date,duration,name,volume
4173506,2019-08-31,490.0,#الهلال_الرايد,162430.0
4173507,2019-08-31,190.0,코엑스,10596.0
4173508,2019-08-31,100.0,#FoodCPNxTEMPT,99623.0
4173509,2019-08-31,100.0,#あすかなBonDance,27517.0
4173510,2019-08-31,100.0,#اكثر_مطعم_تحبه,0.0
...,...,...,...,...
4174082,2019-08-31,30.0,#Newwieeอยู่นี่,0.0
4174083,2019-08-31,30.0,#استعداداتكم_للمدرسه,0.0
4174084,2019-08-31,30.0,Bill O'Brien,0.0
4174085,2019-08-31,30.0,#ourflowerkai,0.0


In [62]:
df = pd.read_csv('%s/%s' % (TWEETS_PATH, '2019-08-31_tweetsevenmorebasic.csv.bz2.bz2'))

In [48]:
prepare_data_trend_date_indexed_parallelized()