In [1]:
# coding: utf-8
import re
import sys
import numpy as np
import pandas as pd
import time
import datetime
from tqdm import tqdm
from scipy.stats import poisson
from configparser import ConfigParser, ExtendedInterpolation
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
# Gensim
import gensim
import gensim.corpora as corpora
import gensim.models as models

In [2]:
def setup():
    """Register tqdm's pandas integration (enables ``progress_apply`` and friends)."""
    tqdm.pandas()

In [3]:
# Enable tqdm progress bars for pandas before any heavy processing.
setup()
# "#" and ";" both start inline comments; ExtendedInterpolation allows
# ${section:option} references inside the ini file.
config = ConfigParser(inline_comment_prefixes="#;", interpolation=ExtendedInterpolation())
# read() returns the list of files it managed to parse; a missing file is
# silently skipped, so an empty list here means the config was not found.
config.read('../config.ini')

['../config.ini']

In [4]:
# Paths and vocabulary thresholds taken from the ini file.
# NOTE: ConfigParser returns every value as a string; min_freq, max_freq and
# dict_size are not converted to numbers here and are not passed to
# getSemesterTermFrequencyMatrixFrom in the final cell of this notebook.
inputfile = config['Text Cleaning']['tokenized_file']
output = config['General']['output_file']
min_freq = config['General']['min_freq']
max_freq = config['General']['max_freq']
dict_size = config['General']['dict_size']

In [5]:
(inputfile, output)

('./data/tokenized.data', './data/trends.xlsx')

In [6]:
# Override the config-derived paths ('./data/...') with paths relative to the
# parent directory, replacing the values read from config.ini above.
inputfile = '../data/tokenized.data'
output = '../data/trends.xlsx'

In [7]:
def readData(filename):
    """Load a pickled pandas object and report how long the load took.

    Parameters
    ----------
    filename : str or path-like
        Location of the pickle file, handed straight to ``pd.read_pickle``.

    Returns
    -------
    The object stored in the pickle (a DataFrame in this notebook).

    Notes
    -----
    Unpickling can execute arbitrary code, so only point this at files
    produced by a trusted step of the pipeline.
    """
    print('Reading data....')
    started_at = time.time()
    frame = pd.read_pickle(filename)
    elapsed = time.time() - started_at
    print(f'Read finished in {elapsed:.2f} seconds.\n')
    return frame

In [8]:
# Open the Excel workbook that the trend sheets are written to in the final cell.
writer = pd.ExcelWriter(output, engine='xlsxwriter')
# Load the tokenized tweets produced by the text-cleaning step.
df= readData(inputfile)

Reading data....
Read finished in 3.05 seconds.



In [9]:
df.head()

Unnamed: 0_level_0,index,From,Tweet,Unigrams,Bigrams
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,65285,@SAI,top objectively biggest tech stories of,"[objectively, tech, stories]","[objectively tech, tech stories]"
2010-01-01,195166,@guardiantech,silicon valley campaign seeks startup visa for...,"[silicon, campaign, seeks, startup, visa, fore...","[silicon campaign, campaign seeks, seeks start..."
2010-01-01,502284,@TechCrunch,: my fifth annual list of the tech products i ...,"[fifth, list, tech, products, love, use]","[fifth list, list tech, tech products, product..."
2010-01-01,502285,@TechCrunch,namebench: google % project to find the fastes...,"[namebench, project, find, fastest, dns, server]","[namebench project, project find, find fastest..."
2010-01-01,502286,@TechCrunch,six new years resolutions for apple and the ip...,"[six, resolutions]",[six resolutions]


In [10]:
df = df.reset_index()

In [11]:
df.head()

Unnamed: 0,Date,index,From,Tweet,Unigrams,Bigrams
0,2010-01-01,65285,@SAI,top objectively biggest tech stories of,"[objectively, tech, stories]","[objectively tech, tech stories]"
1,2010-01-01,195166,@guardiantech,silicon valley campaign seeks startup visa for...,"[silicon, campaign, seeks, startup, visa, fore...","[silicon campaign, campaign seeks, seeks start..."
2,2010-01-01,502284,@TechCrunch,: my fifth annual list of the tech products i ...,"[fifth, list, tech, products, love, use]","[fifth list, list tech, tech products, product..."
3,2010-01-01,502285,@TechCrunch,namebench: google % project to find the fastes...,"[namebench, project, find, fastest, dns, server]","[namebench project, project find, find fastest..."
4,2010-01-01,502286,@TechCrunch,six new years resolutions for apple and the ip...,"[six, resolutions]",[six resolutions]


In [13]:
# Collect, per (From, Date) pair, the list of unigram token lists.
# NOTE: `g` is rebound by a later cell (g = df.reset_index()), so this grouped
# Series is not used by the cells that follow.
g = df.groupby(['From', 'Date'])['Unigrams'].apply(list)

In [None]:
df = df.set_index(['From', 'Date'])

In [None]:
df.sort_index(inplace=True)

In [None]:
df.head(10)

In [None]:
g = df.reset_index()

In [None]:
g.head()

In [None]:
g['temp']=g['Date'].dt.strftime('%Y-%m-%d')

In [None]:
g['temp']=g['From']+'---'+g['temp']

In [None]:
# Display only: set_index returns a new frame and the result is not assigned,
# so `g` itself keeps 'temp' as a regular column (the groupby in the next cell
# relies on that).
g.set_index('temp')

In [None]:
h = g.groupby('temp')['Unigrams'].apply(list)

In [None]:
h.head()

In [None]:
b = '@BBCTech---2010-01-01'

In [None]:
h = pd.DataFrame(h)

In [None]:
h.head()

In [None]:
h['From']=h.index

In [None]:
h['Date']= h['From'].apply(lambda x: x.split('---')[1])
h['From']= h['From'].apply(lambda x: x.split('---')[0])
h.head()

In [None]:
h['Date']=pd.to_datetime(h['Date'])

In [None]:
h = h.set_index('Date')


In [None]:
def flatNestedList(list_of_lists):
    """Flatten one level of nesting.

    Takes an iterable of iterables and returns a single list holding the
    elements of every inner iterable, in their original order.
    """
    flattened = []
    for inner in list_of_lists:
        flattened.extend(inner)
    return flattened

In [None]:
h['Unigrams']=h['Unigrams'].apply(flatNestedList)

In [None]:
# Bucket the per-source token lists into half-year bins ('2QS' = every second
# quarter start). The result has to be kept: the following cells flatten `f`
# element-wise and then read f['Unigrams'], so `f` must be the resampled
# 'Unigrams' Series (each bin holding a list of token lists).
f = h['Unigrams'].resample('2QS', closed='left', label='left').apply(list)

In [None]:
f.head()

In [None]:
f = f.apply(flatNestedList)

In [None]:
f = pd.DataFrame(f)

In [None]:
f.head()

In [None]:
cv = CountVectorizer(tokenizer=(lambda x: x), preprocessor=(lambda x: x))

In [None]:
table = cv.fit_transform(f['Unigrams'])

In [None]:
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2; prefer
# get_feature_names_out() and fall back only on older releases.
if hasattr(cv, 'get_feature_names_out'):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# One row per resampled period, one column per vocabulary term.
fromterm = pd.DataFrame(table.toarray(), index=f.index, columns=feature_names)

In [None]:
fromterm = fromterm.T

In [None]:
def getSemesterTermFrequencyMatrixFrom(dataframe, column='Unigrams', min_freq=2, max_freq=500, max_features=100000):
    """Build a term x half-year frequency matrix from tokenized tweets.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must be indexed by a DatetimeIndex (it is resampled by day) and hold
        a column of token lists named ``column``.
    column : str
        Name of the token-list column to count ('Unigrams' or 'Bigrams').
    min_freq, max_freq : int or float
        Forwarded to ``CountVectorizer`` as ``min_df`` / ``max_df``; each
        calendar day is one "document".
    max_features : int or None
        Forwarded to ``CountVectorizer`` so the vocabulary is capped at the
        most frequent terms. (Previously accepted but silently ignored.)

    Returns
    -------
    semterm : pandas.DataFrame
        Rows are terms, columns are consecutive half-year periods numbered
        1..n, values are summed daily counts.
    columns : pandas.Index
        ``'%Y-%b'`` labels of the period starts, in the same order as the
        numbered columns of ``semterm``.
    """
    print('Counting term frequency')
    df = pd.DataFrame(dataframe[column])
    # One document per calendar day: concatenate every tweet's tokens.
    df = df.resample('D', closed='left', label='left').apply(flatNestedList)
    # Tokens are already lists, so tokenizer/preprocessor are identity functions.
    cv = CountVectorizer(
        tokenizer=(lambda x: x),
        preprocessor=(lambda x: x),
        min_df=min_freq,
        max_df=max_freq,
        max_features=max_features,
    )
    table = cv.fit_transform(df[column])
    docterm = pd.DataFrame(table.toarray())
    docterm.index = df.index
    # '2QS' = every second quarter start, i.e. half-year bins.
    semterm = docterm.resample('2QS', closed='left', label='left').sum()
    # get_feature_names() was removed in scikit-learn 1.2.
    if hasattr(cv, 'get_feature_names_out'):
        semterm.columns = cv.get_feature_names_out()
    else:
        semterm.columns = cv.get_feature_names()
    semterm = semterm.T
    columns = semterm.columns.strftime(date_format='%Y-%b')
    # Downstream code addresses periods by the integer labels 1..n.
    semterm.columns = np.arange(1, len(semterm.columns) + 1).astype(int)
    return semterm, columns

In [None]:
def normalize(df):
    """Rescale every column so that it sums to 100000.

    Each value is divided by its column total and multiplied by 100000,
    turning raw counts into a rate per 100000 within each column.
    """
    print('Normalizing')
    column_totals = df.sum(axis=0)
    shares = df.div(column_totals, axis='columns')
    return shares * 100000

In [None]:
def getK(df, transform=None, past=3):
    """Return the per-period baseline values used as ``k`` in the Poisson test.

    Parameters
    ----------
    df : pandas.DataFrame
        Terms in rows, periods in columns; the columns are expected to be
        labelled with the integers 1..n (the final slice is label-based).
    transform : {None, 'max', 'mean'}
        * ``None``   - use the raw values.
        * ``'max'``  - running maximum of each row up to and including the
          current period.
        * ``'mean'`` - mean of the current period and up to ``past``
          preceding periods.
        Any other value leaves the data untouched.
    past : int
        Number of preceding periods included in the ``'mean'`` window.

    Returns
    -------
    pandas.DataFrame
        The (optionally transformed) frame without its last period, i.e.
        columns 1..n-1.
    """
    if transform == 'max':
        # The running maximum used to be computed into a scratch table that
        # was never written back, so 'max' silently returned the raw data.
        df = df.cummax(axis=1)
    elif transform == 'mean':
        table = np.zeros(shape=df.shape)
        for j in range(len(df.columns)):
            bound = max(0, j - past)
            # Positional window [bound, j]; averaged across all rows at once.
            table[:, j] = df.iloc[:, bound:j + 1].mean(axis=1).to_numpy()
        df = pd.DataFrame(table, index=df.index, columns=df.columns)
    # Label-based slice: keeps periods 1..n-1 and drops the last one.
    return df.loc[:, 1:len(df.columns) - 1]

In [None]:
def getPoisson(df, transform=None):
    """Poisson CDF of each period's baseline under the following period's rate.

    For every term and every period t >= 2 this evaluates
    ``poisson.cdf(k, mu)`` where ``k`` comes from ``getK`` (periods 1..n-1)
    and ``mu`` is the value in the next period (periods 2..n); the two are
    paired by position. A small result means the earlier value would be
    improbably low if the later value were the true Poisson mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Terms in rows, periods in columns labelled 1..n.
    transform : {None, 'max', 'mean'}
        Passed through to ``getK``.

    Returns
    -------
    pandas.DataFrame
        Same row index as ``df``; columns are ``df.columns[1:]``.
    """
    print('Calculating poisson percentages')
    period_count = len(df.columns)
    baseline = getK(df, transform=transform)
    following = df.loc[:, 2:period_count]
    cdf_values = poisson.cdf(k=baseline, mu=following)
    return pd.DataFrame(cdf_values, index=df.index, columns=df.columns[1:])

In [None]:
def generateTrends(df, columns, size, threshold):
    """List, per period, the terms with the lowest scores below ``threshold``.

    Parameters
    ----------
    df : pandas.DataFrame
        Scores with terms as the row index and one column per period.
    columns : sequence
        Period labels; the first label is skipped, so ``columns[1:]`` must
        have as many entries as ``df`` has columns.
    size : int
        Maximum number of terms kept per period.
    threshold : float
        Only terms scoring strictly below this value are kept.

    Returns
    -------
    pandas.DataFrame
        One column per period (named from ``columns[1:]``) holding the
        selected terms ordered from lowest to highest score. Periods with
        fewer hits than the longest one are padded with missing values.
    """
    print('Creating xls file')

    def lowest_terms(period):
        scores = df.loc[:, period]
        ranked = scores[scores < threshold].sort_values(ascending=True)
        return np.array(ranked.iloc[:size].index)

    per_period = [lowest_terms(period) for period in df.columns]
    trends = pd.DataFrame(per_period).T
    trends.columns = columns[1:]
    return trends

In [None]:


# An earlier cell re-indexed `df` by (From, Date), but
# getSemesterTermFrequencyMatrixFrom resamples by day and therefore needs the
# tweets indexed by Date alone. Build that view here instead of relying on
# whichever index `df` currently carries.
if isinstance(df.index, pd.DatetimeIndex):
    tweets_by_date = df
else:
    tweets_by_date = df if 'Date' in df.columns else df.reset_index()
    tweets_by_date = tweets_by_date.set_index('Date').sort_index()

# One sheet per n-gram size: count -> normalize -> Poisson score -> top terms.
for column in ['Unigrams', 'Bigrams']:
    semterm, columns = getSemesterTermFrequencyMatrixFrom(tweets_by_date, column)
    semterm = normalize(semterm)
    p = getPoisson(semterm)
    trends = generateTrends(p, columns, 1000, 0.05)
    trends.to_excel(writer, sheet_name=column)
# ExcelWriter.save() was removed in pandas 2.0; close() writes the workbook
# and releases the file handle on old and new versions alike.
writer.close()