In [31]:
# -*- coding: utf-8 -*-
import inspect
import pickle
from typing import List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

def optimize_floats(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast the ``float64`` columns of ``df`` via ``pd.to_numeric``.

    The frame is modified in place; the same object is returned so the call
    can be nested inside the other ``optimize_*`` helpers.

    Args:
        df: Frame whose ``float64`` columns should be narrowed.

    Returns:
        The very same ``df`` object, with its float columns reassigned.
    """
    float64_cols = list(df.select_dtypes(include=['float64']).columns)
    narrowed = df[float64_cols].apply(
        lambda series: pd.to_numeric(series, downcast='float')
    )
    df[float64_cols] = narrowed
    return df


def optimize_ints(df: pd.DataFrame) -> pd.DataFrame:
    """Downcast the ``int64`` columns of ``df`` via ``pd.to_numeric``.

    The frame is modified in place; the same object is returned so the call
    can be nested inside the other ``optimize_*`` helpers.

    Args:
        df: Frame whose ``int64`` columns should be narrowed.

    Returns:
        The very same ``df`` object, with its integer columns reassigned.
    """
    int64_cols = list(df.select_dtypes(include=['int64']).columns)
    narrowed = df[int64_cols].apply(
        lambda series: pd.to_numeric(series, downcast='integer')
    )
    df[int64_cols] = narrowed
    return df


def optimize_objects(df: pd.DataFrame, datetime_features: List[str],
                     max_unique_ratio: float = 0.5) -> pd.DataFrame:
    """Convert object columns of ``df`` to ``category`` or datetime, in place.

    Only columns reported by ``select_dtypes(include=['object'])`` are
    visited. For each of them:

    * if its name is listed in ``datetime_features`` it is parsed with
      ``pd.to_datetime``;
    * otherwise it becomes ``category`` when the share of distinct values
      (as counted by ``Series.unique``) is strictly below
      ``max_unique_ratio``.

    Args:
        df: Frame to modify in place.
        datetime_features: Names of object columns holding date/time values.
        max_unique_ratio: Exclusive upper bound on ``unique / total`` for a
            column to be turned into a categorical. Defaults to 0.5.

    Returns:
        The same ``df`` object that was passed in.
    """
    row_count = len(df)
    for col in df.select_dtypes(include=['object']):
        if col in datetime_features:
            df[col] = pd.to_datetime(df[col])
            continue
        if row_count == 0:
            # Nothing to measure on an empty frame; skipping also avoids
            # dividing by zero when computing the ratio below.
            continue
        unique_ratio = len(df[col].unique()) / row_count
        if unique_ratio < max_unique_ratio:
            # Report which columns get converted.
            print(col)
            df[col] = df[col].astype('category')
    return df



def optimize(df: pd.DataFrame, datetime_features: Optional[List[str]] = None,
             downcast: bool = False) -> pd.DataFrame:
    """Optionally shrink the memory footprint of ``df``.

    By default the frame is returned untouched, which is what this function
    effectively did before (an early ``return df`` made the rest of its body
    unreachable). Pass ``downcast=True`` to run the numeric downcasting
    pipeline that used to sit behind that early return.

    Args:
        df: Frame to (optionally) optimise.
        datetime_features: Accepted for backward compatibility; currently
            unused because the object/datetime conversion step is disabled.
        downcast: When true, apply ``optimize_ints`` and then
            ``optimize_floats`` to ``df`` (both modify it in place).

    Returns:
        ``df`` itself, downcast only if ``downcast`` is true.
    """
    if not downcast:
        return df
    return optimize_floats(optimize_ints(df))

# Load the pickled records. Judging by the column names assigned further
# down, each record is (left, right, distance, occurrence).
# NOTE(security): pickle.load can execute arbitrary code while unpickling --
# only open files from a trusted source.
with open("reddit_comments_dbAggrData_fromQuery20200604_135102.dat", mode='rb') as f:
    data = pickle.load(f)

print(f'before filter: {len(data)=}')

# Tokens to discard on either side of a pair (punctuation / markup debris).
ignored = ["!",".",",",":","?",";","&gt","//www"]
# A pair is kept only if it occurs strictly more often than this.
limit = 10
data = [
    rec for rec in data
    if rec[3] > limit and rec[0] not in ignored and rec[1] not in ignored
]
print(f'after filter: {len(data)=}')

df = optimize(pd.DataFrame.from_records(data, columns=['left','right','distance','occurrence']))
# Reuse `limit` instead of repeating the literal so both filters stay in sync.
df = df[df['occurrence'] > limit].sort_values('occurrence', ascending=False)

# Disabled experiment: round occurrences down to a multiple of 100.
#df['occurrence'] = df['occurrence'].apply(lambda x: x-(x%100))
notsame = df['left'] != df['right']
# `mean` / `notRare` are computed but not applied to `result` below.
mean = df['occurrence'].mean()
notRare = df['occurrence'] > mean
close = df['distance'] == 1
# Keep pairs of two different tokens whose distance is exactly 1.
result = df[notsame & close]

# Put the combined "left right" string in front as the first column.
result.insert(0, "pair", result['left']+" "+result['right'])
print(f'{type(result)=}')
print(result)


before filter: len(data)=2275945
after filter: len(data)=25079
type(result)=<class 'pandas.core.frame.DataFrame'>
           pair left right  distance  occurrence
9534     of the   of   the         1        1131
7185     in the   in   the         1        1125
13556     to be   to    be         1         672
9901     on the   on   the         1         602
13548    to the   to   the         1         596
...         ...  ...   ...       ...         ...
6857   his head  his  head         1          11
6834     him up  him    up         1          11
6831    him but  him   but         1          11
6827     him on  him    on         1          11
6826    him out  him   out         1          11

[5337 rows x 5 columns]


In [32]:
# Preview the first entries of the combined "pair" column.
result['pair'].head()

9534     of the
7185     in the
13556     to be
9901     on the
13548    to the
Name: pair, dtype: object

In [33]:
# Total occurrences per pair; asking agg for a list of functions yields a
# DataFrame whose single column is named 'sum'.
occurrence_by_pair = result.groupby('pair')['occurrence']
grp = occurrence_by_pair.agg(['sum'])
print(grp.head())

           sum
pair          
&amp nbsp   55
* *Ive      27
* 1         11
*I am       32
*[I am      59


In [45]:
# Check what kind of object the aggregation produced.
type(grp)

pandas.core.frame.DataFrame

In [53]:
# List the column labels of the aggregated frame.
grp.columns

Index(['sum'], dtype='object')

In [65]:
# Display grp with 'pair' moved from the index into a regular column.
# The result is only shown, not stored: grp itself is left unchanged.
grp.reset_index()

Unnamed: 0,pair,sum
0,&amp nbsp,55
1,* *Ive,27
2,* 1,11
3,*I am,32
4,*[I am,59
...,...,...
3461,┐ ノʘДʘノ┌༼ຈnຈ༽┐,31
3462,✔ |,16
3463,つ ◕_◕,18
3464,ノʘДʘノ┌༼ຈnຈ༽┐ ノ༼༎ຶ_༎ຶノ༽,31


In [62]:
# Same column labels as above, exposed as a NumPy array.
grp.columns.values

array(['sum'], dtype=object)