In [1]:
# coding: utf-8
import re
import sys
import numpy as np
import pandas as pd
import time
import datetime
from tqdm import tqdm
from scipy.stats import poisson
from configparser import ConfigParser, ExtendedInterpolation
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
# Gensim
import gensim
import gensim.corpora as corpora
import gensim.models as models

In [2]:
def setup():
    """Register tqdm's pandas integration (enables ``progress_apply`` and friends)."""
    tqdm.pandas()

In [3]:
# Enable tqdm's pandas hooks before any data processing.
setup()
# "#;" makes both '#' and ';' start an inline comment in the ini file;
# ExtendedInterpolation allows ${section:option} references between values.
config = ConfigParser(inline_comment_prefixes="#;", interpolation=ExtendedInterpolation())
# read() returns the list of files it managed to parse (shown as the cell output).
config.read('../config.ini')

['../config.ini']

In [4]:
# Pull the input/output locations and vocabulary limits from the parsed config.
inputfile = config['Text Cleaning']['tokenized_file']
output = config['General']['output_file']
# NOTE(review): ConfigParser returns these three values as strings, and none of
# the visible cells convert them or pass them on — confirm whether they are
# still needed (the matrix builder is called with its own defaults).
min_freq = config['General']['min_freq']
max_freq = config['General']['max_freq']
dict_size = config['General']['dict_size']

In [5]:
# Show the paths that came out of the config file.
(inputfile, output)

('./data/tokenized.data', './data/trends.xlsx')

In [6]:
# Override the configured './data/...' paths with '../data/...' equivalents,
# i.e. relative to the parent directory (the same place config.ini is read from).
# NOTE(review): presumably because the notebook runs from a subdirectory — confirm.
inputfile = '../data/tokenized.data'
output = '../data/trends.xlsx'

In [7]:
def readData(filename):
    """Load a pickled pandas object from ``filename`` and report how long it took.

    Parameters
    ----------
    filename : str or path-like
        Location of the pickle file, passed straight to ``pd.read_pickle``.

    Returns
    -------
    object
        Whatever ``pd.read_pickle`` returns for the file.
    """
    print('Reading data....')
    # perf_counter is monotonic, so the elapsed time cannot be distorted by
    # system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    # NOTE(security): unpickling can execute arbitrary code; only load files
    # that come from a trusted source.
    df = pd.read_pickle(filename)
    elapsed = time.perf_counter() - start
    print(f'Read finished in {elapsed:.2f} seconds.\n')
    return df

In [8]:
# Load the tokenized dataset from the (overridden) input path.
df= readData(inputfile)

Reading data....
Read finished in 2.65 seconds.



In [9]:
def flatNestedList(list_of_lists):
    """Concatenate the items of every inner iterable into one flat list.

    Only one level of nesting is removed; the order of the items is kept.
    """
    flattened = []
    for inner in list_of_lists:
        flattened.extend(inner)
    return flattened

In [10]:
def getSemesterTermFrequencyMatrixFrom(dataframe, column='Unigrams', min_freq=2, max_freq=500, max_features=100000, vocab=None, stops=None):
    """Build a term-by-period count matrix from a column of token lists.

    The rows of ``dataframe[column]`` are first merged per calendar day (the
    token lists of a day are concatenated), each day is counted as one
    document, and the daily counts are then summed into '2QS' bins (two
    quarters, anchored at quarter starts).

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must be resample-able, i.e. carry a datetime-like index.
        Assumes ``dataframe[column]`` holds one list of tokens per row — TODO confirm.
    column : str
        Name of the token column to count.
    min_freq, max_freq : int or float
        Forwarded to ``CountVectorizer`` as ``min_df`` / ``max_df``.
    max_features : int
        Accepted for interface compatibility; currently not used.
    vocab : mapping or iterable, optional
        Fixed vocabulary forwarded to ``CountVectorizer``.
    stops : optional
        Stop-word specification forwarded to ``CountVectorizer``.

    Returns
    -------
    tuple
        ``(semterm, columns, vocab, stops)`` where ``semterm`` has one row per
        term and integer column labels ``1..n`` (one per period), ``columns``
        holds the period labels formatted as ``'%Y-%b'``, ``vocab`` is the
        fitted ``vocabulary_`` and ``stops`` the vectorizer's stop words.
    """
    print('Counting term frequency')
    df = pd.DataFrame(dataframe[column])
    # One "document" per day: concatenate every token list that falls on that day.
    df = df.resample('D', closed='left', label='left').apply(flatNestedList)
    # Identity tokenizer/preprocessor: the documents are already token lists.
    cv = CountVectorizer(tokenizer=(lambda x: x), preprocessor=(lambda x: x), min_df=min_freq, max_df=max_freq, stop_words=stops, vocabulary=vocab)
    table = cv.fit_transform(df[column])
    vocab = cv.vocabulary_
    stops = cv.get_stop_words()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on old releases that lack it.
    if hasattr(cv, 'get_feature_names_out'):
        terms = list(cv.get_feature_names_out())
    else:
        terms = cv.get_feature_names()
    # toarray() yields a plain ndarray instead of the legacy np.matrix from todense().
    docterm = pd.DataFrame(table.toarray(), index=df.index)
    semterm = docterm.resample('2QS', closed='left', label='left').sum()
    semterm.columns = terms
    semterm = semterm.T
    # Keep the human-readable period labels, then number the periods 1..n.
    columns = semterm.columns.strftime(date_format='%Y-%b')
    semterm.columns = np.arange(1, len(semterm.columns) + 1).astype(int)
    return semterm, columns, vocab, stops

In [11]:
# Count bigrams over the whole dataset; the fitted vocabulary and stop words
# are kept so a later call can reuse them.
semterm, columns, vocab, stops = getSemesterTermFrequencyMatrixFrom(df, 'Bigrams')

Counting term frequency


In [12]:
# Inspect the term-by-period count matrix (rows: terms, columns: period numbers).
semterm

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
.\r\n. city,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
.\r\n. diego,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
.\r\n. jose,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0
.\r\n. oklahoma,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0,0,0,0
. . . hopefully,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,0
. . . oh,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
. . . whether,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,5,1
/o: announcements,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0
/o: watch,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0
000ft jump,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0


In [None]:
# Re-run the bigram count on the rows whose 'From' value is '@TechCrunch',
# reusing the vocabulary and stop words fitted on the full dataset.
# `test` receives the whole 4-tuple returned by the function.
test = getSemesterTermFrequencyMatrixFrom(df[df['From']=='@TechCrunch'], 'Bigrams', vocab=vocab, stops=stops)

In [None]:
# Display the filtered result: (matrix, period labels, vocabulary, stop words).
test

In [None]:


# Write one sheet of trending terms per n-gram column to the Excel workbook.
# NOTE(review): normalize, getPoisson, generateTrends and `writer` are defined
# in cells further down this notebook; those cells must have been executed
# before this one.
for column in ['Unigrams', 'Bigrams']:
    # The matrix builder returns (semterm, columns, vocab, stops); only the
    # first two are needed here, so slice instead of unpacking all four.
    semterm, columns = getSemesterTermFrequencyMatrixFrom(df, column)[:2]
    semterm = normalize(semterm)
    p = getPoisson(semterm)
    # Keep at most 1000 terms per period whose value is below 0.05.
    trends = generateTrends(p, columns, 1000, 0.05)
    trends.to_excel(writer, sheet_name=column)
# close() writes the workbook to disk; ExcelWriter.save() was removed in pandas 2.0.
writer.close()

In [None]:
def normalize(df):
    """Rescale every column so that its entries add up to 100000.

    Each value is divided by the total of its column and multiplied by
    100000, i.e. expressed as a rate per 100000 within that column.
    """
    print('Normalizing')
    column_totals = df.sum(axis=0)
    shares = df.div(column_totals, axis=1)
    return shares * 100000

In [None]:
def getK(df, transform=None, past=3):
    """Return the reference values each period is compared against.

    For every column position ``j`` except the last, the value stored at
    ``j`` summarises the history up to and including ``j``:

    * ``transform='max'``  – maximum of columns ``0..j``;
    * ``transform='mean'`` – mean of the last ``past + 1`` columns ending at ``j``;
    * anything else        – the data is left unchanged.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric matrix. The final label slice assumes integer column labels
        ``1..n`` — TODO confirm for callers other than the matrix builder.
    transform : {'max', 'mean', None}
        Which summary to apply.
    past : int
        Number of earlier columns included in the ``'mean'`` window.

    Returns
    -------
    pandas.DataFrame
        Columns labelled ``1`` through ``len(df.columns) - 1``.
    """
    if transform in ('max', 'mean'):
        n_cols = len(df.columns)
        # The last column stays zero; it is dropped by the slice below when
        # the labels run 1..n.
        table = np.zeros(shape=df.shape)
        for j in range(n_cols - 1):
            if transform == 'max':
                window = df.iloc[:, :j + 1]
                table[:, j] = window.max(axis=1).to_numpy()
            else:
                bound = max(0, j - past)
                window = df.iloc[:, bound:j + 1]
                table[:, j] = window.mean(axis=1).to_numpy()
        # Rebuild the frame for BOTH transforms; previously this only happened
        # for 'mean', so 'max' silently returned the untransformed data.
        df = pd.DataFrame(table, index=df.index, columns=df.columns)
    return df.loc[:, 1:len(df.columns)-1]

In [None]:
def getPoisson(df, transform=None):
    """Evaluate the Poisson CDF of each period against the following one.

    ``k`` comes from ``getK(df, transform=transform)`` and ``mu`` from the
    columns labelled ``2`` through ``len(df.columns)``. The result keeps the
    index of ``df`` and the column labels of ``df`` without the first one.
    """
    print('Calculating poisson percentages')
    row_labels = df.index
    later_labels = df.columns[1:]
    reference = getK(df, transform=transform)
    rates = df.loc[:, 2:len(df.columns)]
    probabilities = poisson.cdf(k=reference, mu=rates)
    result = pd.DataFrame(probabilities)
    result.index = row_labels
    result.columns = later_labels
    return result

In [None]:
# Excel workbook that receives one sheet per n-gram column.
# NOTE(review): the export loop that calls writer / to_excel sits in an earlier
# cell of this notebook, so this cell has to be executed before that loop.
writer = pd.ExcelWriter(output, engine='xlsxwriter')

In [None]:
def generateTrends(df, columns, size, threshold):
    """List, per column of ``df``, the index labels with the smallest values.

    For each column, rows whose value is below ``threshold`` are sorted in
    ascending order and the first ``size`` index labels are kept. The lists
    become the columns of the returned frame, labelled with ``columns[1:]``.
    """
    print('Creating xls file')
    ranked_labels = []
    for label in df.columns:
        below_threshold = df[df.loc[:, label] < threshold]
        ordered = below_threshold.sort_values(by=[label], ascending=True)
        strongest = ordered[:size].loc[:, label]
        ranked_labels.append(np.array(strongest.index))
    trends = pd.DataFrame(ranked_labels).T
    trends.columns = columns[1:]
    return trends