In [1]:
# coding: utf-8
import re
import sys
import numpy as np
import pandas as pd
import time
from tqdm import tqdm
from scipy.stats import poisson
from configparser import ConfigParser, ExtendedInterpolation
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

In [2]:
def setup():
    """One-time notebook initialisation.

    Calls ``tqdm.pandas()``, which per the tqdm documentation registers the
    ``progress_apply``-style methods on pandas objects.
    NOTE(review): none of the code visible in this notebook calls
    ``progress_apply``; confirm whether this hook is still needed.
    """
    tqdm.pandas()

def flatNestedList(list_of_lists):
    """Flatten one level of nesting.

    Iterates over ``list_of_lists`` and appends the items of every inner
    iterable, in order, to a single new list which is returned. Only the
    outermost level is flattened; deeper nesting is left untouched.
    """
    flattened = []
    for inner in list_of_lists:
        flattened.extend(inner)
    return flattened

In [3]:
def readData(filename):
    """Load a pickled pandas object and report how long the read took.

    Parameters
    ----------
    filename : path of the pickle file, passed straight to ``pd.read_pickle``.

    Returns
    -------
    Whatever object ``pd.read_pickle`` deserialises from ``filename``.

    Notes
    -----
    Unpickling can execute arbitrary code, so only point this at files that
    come from a trusted source.
    """
    print('Reading data....')
    start = time.time()
    loaded = pd.read_pickle(filename)
    end = time.time()
    # Wall-clock duration of the read only.
    print(f'Read finished in {end-start:.2f} seconds.\n')
    return loaded

In [4]:
# Notebook bootstrap: initialise tqdm, read the configuration, open the Excel
# writer and load the input frame. Later cells rely on the globals defined
# here (`config`, `inputfile`, `output`, `writer`, `df`).
setup()
# The string "#;" is split into the two inline-comment prefixes '#' and ';'.
# ExtendedInterpolation enables ${section:option} references inside config.ini.
config = ConfigParser(inline_comment_prefixes="#;", interpolation=ExtendedInterpolation())
# NOTE(review): ConfigParser.read() silently skips files it cannot open, so a
# missing config.ini only surfaces as a KeyError on the lookups below.
config.read('config.ini')
# Path of the pickle that is loaded into `df` below.
inputfile = config['Text Cleaning']['tokenized_file']
# Destination of the Excel workbook.
output = config['General']['output_file']
# The writer stays open across cells; the final cell adds one sheet per n-gram
# column and then writes the workbook to disk.
writer = pd.ExcelWriter(output, engine='xlsxwriter')
# NOTE(review): later cells resample `df` and read its 'From', 'Unigrams' and
# 'Bigrams' columns, so it presumably has a DatetimeIndex - confirm.
df= readData(inputfile)

Reading data....
Read finished in 3.32 seconds.



In [5]:
def getSemesterTermFrequencyMatrixFrom(dataframe, column='Unigrams', min_freq=2, max_freq=500, max_features=100000, vocab=None):
    """Build a term x semester count matrix from a column of token lists.

    Steps:
      1. Resample ``dataframe[column]`` into daily bins, merging every token
         list that falls on the same day into one list (``flatNestedList``).
      2. Count terms per day with ``CountVectorizer``. The tokenizer and the
         preprocessor are identity functions because each "document" is
         already a list of tokens.
      3. Sum the daily counts into two-quarter ('2QS') bins and transpose so
         that rows are terms and columns are semesters.

    Parameters
    ----------
    dataframe : DataFrame whose index supports ``resample`` (the column labels
        produced below expose ``.year`` / ``.quarter``, i.e. timestamps).
    column : name of the column holding the token lists.
    min_freq, max_freq, max_features : forwarded to ``CountVectorizer`` as
        ``min_df``, ``max_df`` and ``max_features``.
    vocab : optional vocabulary forwarded to ``CountVectorizer`` as
        ``vocabulary``.

    Returns
    -------
    (semterm, vocabulary) : ``semterm`` is the term x semester DataFrame with
        columns labelled ``'<year>-<half>'`` where ``half = (quarter + 1) // 2``
        of each bin's left edge; ``vocabulary`` is ``cv.vocabulary_``.
    """
    print ('Generating Semester x Term matrix')
    df = pd.DataFrame(dataframe[column])
    df = df.resample('D',closed='left', label='left').apply(flatNestedList)
    cv = CountVectorizer(tokenizer=(lambda x: x), preprocessor=(lambda x: x), vocabulary=vocab, min_df=min_freq, max_df=max_freq, max_features=max_features)
    table = cv.fit_transform(df[column])
    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # prefer the replacement and fall back for older installations.
    if hasattr(cv, 'get_feature_names_out'):
        terms = cv.get_feature_names_out()
    else:
        terms = cv.get_feature_names()
    # toarray() yields a plain ndarray (todense() returns the legacy np.matrix).
    docterm = pd.DataFrame(table.toarray(), index=df.index, columns=terms)
    semterm = docterm.resample('2QS',closed='left', label='left').sum()
    semterm = semterm.T
    # Label each bin by the year and half-year of its left edge.
    semterm.columns = [f'{period.year}-{(period.quarter+1)//2}' for period in list(semterm.columns)]
    return semterm, cv.vocabulary_

In [7]:
def applyMask(df, mask):
    """Embed ``df`` into a frame shaped like ``mask``.

    A copy of ``mask`` is taken and the cells addressed by ``df``'s row and
    column labels are overwritten with ``df``'s values; every other cell keeps
    the value it has in ``mask``.

    The caller's ``mask`` is left untouched. Writing into it directly (as the
    previous implementation did) made every call share one buffer, so values
    written for one input leaked into the result of the next call.

    Parameters
    ----------
    df : DataFrame whose index and column labels all exist in ``mask``.
    mask : template DataFrame providing the output shape and default values.

    Returns
    -------
    A new DataFrame with ``mask``'s index and columns.
    """
    filled = mask.copy()
    # Nothing to write when df has no rows or no columns.
    if not df.empty:
        filled.loc[list(df.index), list(df.columns)] = df
    return filled

In [8]:
def getBoostTerm(df, semterm, vocab):
    """Compute a per-term, per-semester boost factor based on source diversity.

    For every distinct value of ``df['From']`` a term x semester count matrix
    is built with the shared vocabulary and embedded into a zero frame shaped
    like ``semterm``. The number of sources with a non-zero count is then
    tallied per cell and combined with ``semterm``'s own values:

        boost = (5 + count - sources) / (4 + count)

    where ``count`` is the value in ``semterm`` and ``sources`` is the tally.
    Smaller values are better.

    Each source is given its own fresh zero frame. Reusing a single mask for
    all sources let values written for one source survive into the next one
    (``applyMask`` may write into the frame it receives), which inflated the
    tally.

    Parameters
    ----------
    df : DataFrame with a 'From' column plus whatever
        ``getSemesterTermFrequencyMatrixFrom`` needs.
    semterm : term x semester DataFrame defining the output shape and labels.
    vocab : vocabulary forwarded to ``getSemesterTermFrequencyMatrixFrom``.

    Returns
    -------
    DataFrame with the same index and columns as ``semterm``.
    """
    print ('Generating Semester x Term x Source matrix')
    template = pd.DataFrame(0.0, index=semterm.index, columns=semterm.columns)
    # Running tally of how many sources mention each term in each semester.
    source_hits = np.zeros(semterm.shape, dtype=int)
    for source in tqdm(df['From'].unique()):
        s, _ = getSemesterTermFrequencyMatrixFrom(df[df['From']==source], min_freq=1, vocab=vocab)
        # A fresh copy per source keeps the iterations independent.
        s = applyMask(s, template.copy())
        source_hits += (s.to_numpy() != 0).astype(int)
    count = semterm.to_numpy()
    boost = (5+count-source_hits)/(4+count) # smaller is better
    bdf = pd.DataFrame(boost, index=semterm.index, columns=semterm.columns)
    return bdf

In [10]:
def generateTrends(df, columns, size, threshold):
    """List, per column, the row labels with the smallest sub-threshold values.

    For every column of ``df`` the rows whose value is strictly below
    ``threshold`` are kept, sorted ascending by that column, and the labels of
    the first ``size`` rows are collected. The per-column label arrays become
    the columns of the result; shorter columns are padded with missing values.

    Parameters
    ----------
    df : DataFrame of scores (rows are labelled by term in this notebook).
    columns : sequence of labels; ``columns[1:]`` names the output columns, so
        it must hold one more entry than ``df`` has columns.
    size : maximum number of labels kept per column.
    threshold : exclusive upper bound on the score.

    Returns
    -------
    DataFrame with one column per column of ``df``.
    """
    print('Creating xls file')
    ranked_labels = []
    for label in df.columns:
        scores = df.loc[:, label]
        below_threshold = df[scores < threshold]
        ordered = below_threshold.sort_values(by=[label], ascending=True)
        top = ordered[:size]
        ranked_labels.append(np.array(top.loc[:, label].index))
    # One row per input column at this point; transpose to one column each.
    trends = pd.DataFrame(ranked_labels).T
    trends.columns = columns[1:]
    return trends

In [11]:
def normalize(df, scale=100000):
    """Rescale every column so that it sums to ``scale``.

    Each value is divided by the sum of its column and multiplied by
    ``scale``, turning raw counts into a rate per ``scale`` units.

    Parameters
    ----------
    df : numeric DataFrame.
    scale : target column total; defaults to 100000, the value that was
        previously hard-coded.

    Returns
    -------
    A new DataFrame with the same labels as ``df``.
    NOTE(review): a column whose sum is zero is divided by zero and so will
    not produce finite values - confirm that cannot happen upstream.
    """
    print('Normalizing')
    return df.div(df.sum(axis=0), axis=1)*scale

In [12]:
def getPoisson(df, transform=None):
    """Poisson CDF of each period's reference value under the next period's rate.

    ``getK`` supplies the reference values ``k`` (the label slice
    ``1 .. n-1`` of ``df``, optionally transformed), and the label slice
    ``2 .. n`` of ``df`` supplies the Poisson means ``mu``. The result holds
    ``poisson.cdf(k, mu)`` element-wise and is labelled with ``df``'s index
    and with ``df.columns[1:]``.

    Parameters
    ----------
    df : DataFrame whose columns are labelled with the integers ``1 .. n``
        (required by the label-based slicing here and in ``getK``).
    transform : forwarded unchanged to ``getK``.

    Returns
    -------
    DataFrame with one column fewer than ``df``.
    """
    print ('Calculating poisson percentages')
    row_labels = df.index
    column_labels = df.columns
    reference = getK(df, transform=transform)
    rates = df.loc[:,2:len(df.columns)]
    probabilities = pd.DataFrame(poisson.cdf(k=reference, mu=rates))
    probabilities.columns = column_labels[1:]
    probabilities.index = row_labels
    return probabilities

In [13]:
def getK(df, transform=None, past=3):
    """Return the reference values ``k`` used by ``getPoisson``.

    The result is the label slice ``1 .. n-1`` of ``df`` (all columns but the
    last when the columns are labelled ``1 .. n``), optionally transformed
    along each row first:

    * ``transform='max'``  - running maximum of the row up to and including
      each column.
    * ``transform='mean'`` - mean of the current column and up to ``past``
      preceding columns.
    * any other value      - values are used as they are.

    Previously the ``'max'`` branch filled a scratch table but never wrote it
    back, so ``transform='max'`` silently returned the untransformed values.

    Parameters
    ----------
    df : numeric DataFrame whose columns are labelled with the integers
        ``1 .. n`` (required by the label-based slice in the return).
    transform : ``None``, ``'max'`` or ``'mean'``.
    past : number of preceding columns included in the ``'mean'`` window.

    Returns
    -------
    DataFrame containing columns ``1 .. n-1``; ``df`` itself is not modified.
    """
    if transform == 'max':
        # Row-wise running maximum; cast to float to match the 'mean' branch.
        df = df.cummax(axis=1).astype(float)
    elif transform == 'mean':
        table = np.zeros(shape=df.shape)
        # Loop over the (few) columns and let pandas vectorise over the rows.
        for j in range(df.shape[1]):
            bound = max(0, j - past)
            table[:, j] = df.iloc[:, bound:j + 1].mean(axis=1).to_numpy()
        df = pd.DataFrame(table, index=df.index, columns=df.columns)

    return df.loc[:,1:len(df.columns)-1]

In [14]:
# Driver cell: build one sheet of trending terms per n-gram column and write
# the workbook opened in the configuration cell.
start = time.time()
for column in ['Unigrams', 'Bigrams']:
    print(f'Processing {column}')
    semterm, vocab = getSemesterTermFrequencyMatrixFrom(df, column)
    # Keep the readable semester labels; they name the sheet columns later on.
    columns = semterm.columns
    semterm = normalize(semterm)
    boost = getBoostTerm(df, semterm, vocab)
    # getPoisson/getK slice by integer column label, so relabel to 1..n.
    semterm.columns = np.arange(1,len(semterm.columns)+1).astype(int)
    boost.columns = np.arange(1,len(boost.columns)+1).astype(int)
    p = getPoisson(semterm)
    # p has columns 2..n; weight it with the matching boost columns.
    p = p * boost.loc[:,2:]
    # Keep at most 1000 terms per semester whose weighted score is below 0.05.
    trends = generateTrends(p, columns, 1000, 0.05)
    trends.to_excel(writer, sheet_name=column)
# ExcelWriter.save() was deprecated in pandas 1.5 and removed in 2.0; close()
# writes the workbook and releases the file handle on old and new versions.
writer.close()
end = time.time()
print(f'Excel file generated in {end-start:.2f} seconds.\n')

Processing Unigrams
Generating Semester x Term matrix


  0%|          | 0/18 [00:00<?, ?it/s]

Normalizing
Generating Semester x Term x Source matrix
Generating Semester x Term matrix


  6%|▌         | 1/18 [00:03<00:53,  3.13s/it]

Generating Semester x Term matrix


 11%|█         | 2/18 [00:05<00:48,  3.03s/it]

Generating Semester x Term matrix


 17%|█▋        | 3/18 [00:09<00:45,  3.06s/it]

Generating Semester x Term matrix


 22%|██▏       | 4/18 [00:12<00:45,  3.22s/it]

Generating Semester x Term matrix


 28%|██▊       | 5/18 [00:16<00:44,  3.41s/it]

Generating Semester x Term matrix


 33%|███▎      | 6/18 [00:20<00:41,  3.48s/it]

Generating Semester x Term matrix


 39%|███▉      | 7/18 [00:23<00:39,  3.57s/it]

Generating Semester x Term matrix


 44%|████▍     | 8/18 [00:27<00:37,  3.72s/it]

Generating Semester x Term matrix


 50%|█████     | 9/18 [00:32<00:35,  3.89s/it]

Generating Semester x Term matrix


 56%|█████▌    | 10/18 [00:36<00:31,  3.91s/it]

Generating Semester x Term matrix


 61%|██████    | 11/18 [00:39<00:25,  3.66s/it]

Generating Semester x Term matrix


 67%|██████▋   | 12/18 [00:42<00:20,  3.43s/it]

Generating Semester x Term matrix


 72%|███████▏  | 13/18 [00:44<00:16,  3.21s/it]

Generating Semester x Term matrix


 78%|███████▊  | 14/18 [00:48<00:13,  3.25s/it]

Generating Semester x Term matrix


 83%|████████▎ | 15/18 [00:52<00:10,  3.41s/it]

Generating Semester x Term matrix


 89%|████████▉ | 16/18 [00:55<00:06,  3.40s/it]

Generating Semester x Term matrix


 94%|█████████▍| 17/18 [00:58<00:03,  3.24s/it]

Generating Semester x Term matrix


100%|██████████| 18/18 [00:58<00:00,  2.43s/it]


Calculating poisson percentages
Creating xls file
Processing Bigrams
Generating Semester x Term matrix
Normalizing
Generating Semester x Term x Source matrix


  0%|          | 0/18 [00:00<?, ?it/s]

Generating Semester x Term matrix


  6%|▌         | 1/18 [00:09<02:45,  9.71s/it]

Generating Semester x Term matrix


 11%|█         | 2/18 [00:21<02:44, 10.27s/it]

Generating Semester x Term matrix


 17%|█▋        | 3/18 [00:32<02:36, 10.44s/it]

Generating Semester x Term matrix


 22%|██▏       | 4/18 [00:42<02:24, 10.36s/it]

Generating Semester x Term matrix


 28%|██▊       | 5/18 [00:52<02:15, 10.46s/it]

Generating Semester x Term matrix


 33%|███▎      | 6/18 [01:03<02:06, 10.52s/it]

Generating Semester x Term matrix


 39%|███▉      | 7/18 [01:13<01:54, 10.37s/it]

Generating Semester x Term matrix


 44%|████▍     | 8/18 [01:22<01:38,  9.81s/it]

Generating Semester x Term matrix


 50%|█████     | 9/18 [01:30<01:23,  9.31s/it]

Generating Semester x Term matrix


 56%|█████▌    | 10/18 [01:40<01:15,  9.48s/it]

Generating Semester x Term matrix


 61%|██████    | 11/18 [01:48<01:04,  9.26s/it]

Generating Semester x Term matrix


 67%|██████▋   | 12/18 [02:00<01:00, 10.00s/it]

Generating Semester x Term matrix


 72%|███████▏  | 13/18 [02:13<00:53, 10.76s/it]

Generating Semester x Term matrix


 78%|███████▊  | 14/18 [02:24<00:43, 10.86s/it]

Generating Semester x Term matrix


 83%|████████▎ | 15/18 [02:39<00:36, 12.04s/it]

Generating Semester x Term matrix


 89%|████████▉ | 16/18 [02:48<00:22, 11.17s/it]

Generating Semester x Term matrix


 94%|█████████▍| 17/18 [02:54<00:09,  9.63s/it]

Generating Semester x Term matrix


100%|██████████| 18/18 [02:55<00:00,  7.09s/it]


Calculating poisson percentages
Creating xls file
Excel file generated in 266.04 seconds.

