In [1]:
# coding: utf-8
import re
import sys
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from scipy.stats import poisson
from configparser import ConfigParser, ExtendedInterpolation
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
# Gensim
import gensim
import gensim.corpora as corpora
import gensim.models as models

In [2]:
def setup():
    tqdm.pandas()

def flatNestedList(list_of_lists):
    return [val for sublist in list_of_lists for val in sublist]

def getSemesterTermFrequencyMatrixFrom(dataframe, column='Unigrams', min_freq=2, max_freq=500, max_features=100000):
    print('Counting term frequency')
    df = pd.DataFrame(dataframe[column])
    df = df.resample('D',closed='left', label='left').apply(flatNestedList)
    cv = CountVectorizer(tokenizer=(lambda x: x), preprocessor=(lambda x: x), min_df=min_freq, max_df=max_freq)
    table = cv.fit_transform(df[column])
    docterm=pd.DataFrame(table.todense())
    docterm.index = df.index
    semterm = docterm.resample('2QS',closed='left', label='left').sum()
    semterm.columns = cv.get_feature_names()
    semterm=semterm.T
    columns = semterm.columns.strftime(date_format='%Y-%b')
    semterm.columns = np.arange(1,len(semterm.columns)+1).astype(int)
    return semterm, columns

def normalize(df):
    print('Normalizing')
    return df.div(df.sum(axis=0), axis=1)*100000

def getPoisson(df):
    print ('Calculating poisson percentages')
    index = df.index
    columns = df.columns
    p = pd.DataFrame(poisson.cdf(k=df.loc[:,2:len(df.columns)],mu=df.loc[:,1:len(df.columns)-1]))
    p.columns = columns[1:]
    p.index = index
    return p


def generateTrends(df, columns, size, threshold):
    print('Creating xls file')
    ll=[]
    for c in df.columns:
        ll.append(np.array(df[df.loc[:,c] < threshold].sort_values(by=[c],ascending=True)[:size].loc[:,c].index))
    trends = pd.DataFrame(ll).T
    trends.columns = columns[1:]
    return trends

def readData(filename):
    print('Reading data....')
    start = time.time()
    df = pd.read_pickle(filename)
    end = time.time()
    print(f'Read finished in {end-start:.2f} seconds.\n')
    return df

In [3]:
print('Generating Trends')
start = time.time()
setup()

Generating Trends


In [4]:
config = ConfigParser(inline_comment_prefixes="#;", interpolation=ExtendedInterpolation())
config.read('../config.ini')

['../config.ini']

In [5]:
inputfile = config['Text Cleaning']['tokenized_file']
output = config['General']['output_file']

In [6]:
inputfile, output

('./data/tokenized.data', './data/trends.xlsx')

In [7]:
writer = pd.ExcelWriter(output, engine='xlsxwriter')

In [8]:
inputfile = '../data/tokenized.data'
output= '../data/trends.xls'

In [9]:
df= readData(inputfile)

Reading data....
Read finished in 2.95 seconds.



In [10]:
df.head()

Unnamed: 0_level_0,index,From,Tweet,Unigrams,Bigrams
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2010-01-01,65285,@SAI,top objectively biggest tech stories of,"[objectively, tech, stories]","[objectively tech, tech stories]"
2010-01-01,195166,@guardiantech,silicon valley campaign seeks startup visa for...,"[silicon, campaign, seeks, startup, visa, fore...","[silicon campaign, campaign seeks, seeks start..."
2010-01-01,502284,@TechCrunch,: my fifth annual list of the tech products i ...,"[fifth, list, tech, products, love, use]","[fifth list, list tech, tech products, product..."
2010-01-01,502285,@TechCrunch,namebench: google % project to find the fastes...,"[namebench, project, find, fastest, dns, server]","[namebench project, project find, find fastest..."
2010-01-01,502286,@TechCrunch,six new years resolutions for apple and the ip...,"[six, resolutions]",[six resolutions]


In [11]:
# for column in ['Unigrams', 'Bigrams']:
#     semterm, columns = getSemesterTermFrequencyMatrixFrom(df, column)
#     semterm = normalize(semterm)
#     p = getPoisson(semterm)
#     trends = generateTrends(p, columns, 1000, 0.05)
#     trends.to_excel(writer, sheet_name=column)
# end = time.time()
# print(f'Excel file generated in {end-start:.2f} seconds.\n')

In [35]:
column = 'Unigrams'

In [36]:
semterm, columns = getSemesterTermFrequencyMatrixFrom(df, column)
semterm = normalize(semterm)
p = getPoisson(semterm)
# trends = generateTrends(p, columns, 1000, 0.5)
# trends.to_excel(writer, sheet_name=column)

Counting term frequency
Normalizing
Calculating poisson percentages


In [78]:
for c in p.columns:
    print (i)
    a = (p.sort_values(by=[c],ascending=True)[:1000].loc[:,c])
    a = a.reset_index().sort_values(by=['index'])
    a = a.set_index('index')
    print (a['fold':'fold'])

1
Empty DataFrame
Columns: [2]
Index: []
1
Empty DataFrame
Columns: [3]
Index: []
1
Empty DataFrame
Columns: [4]
Index: []
1
Empty DataFrame
Columns: [5]
Index: []
1
Empty DataFrame
Columns: [6]
Index: []
1
Empty DataFrame
Columns: [7]
Index: []
1
Empty DataFrame
Columns: [8]
Index: []
1
Empty DataFrame
Columns: [9]
Index: []
1
Empty DataFrame
Columns: [10]
Index: []
1
Empty DataFrame
Columns: [11]
Index: []
1
Empty DataFrame
Columns: [12]
Index: []
1
Empty DataFrame
Columns: [13]
Index: []
1
Empty DataFrame
Columns: [14]
Index: []
1
Empty DataFrame
Columns: [15]
Index: []
1
Empty DataFrame
Columns: [16]
Index: []
1
Empty DataFrame
Columns: [17]
Index: []
1
            18
index         
fold   0.00001
1
Empty DataFrame
Columns: [19]
Index: []


Unnamed: 0_level_0,11
index,Unnamed: 1_level_1
