In [1]:
from gensim.models.ldamodel import LdaModel
from gensim.models.coherencemodel import CoherenceModel

In [2]:
# Importing modules
import pandas as pd
import os

In [3]:
def validate(df):
    """Check that every row of ``df`` has a non-empty ``body`` and ``headline``.

    A value counts as missing when it is NaN/None or when its string form
    is empty. Every offending row is printed, followed by a one-line summary.

    Args:
        df: DataFrame with ``body`` and ``headline`` columns.

    Returns:
        bool: True when no row failed the check, False otherwise.
    """
    def _is_blank(value):
        # pd.read_csv loads empty cells as NaN, and str(NaN) is 'nan'
        # (length 3), so a bare length check never fires on missing data.
        return pd.isna(value) or len(str(value)) == 0

    overall = True
    for index, row in df.iterrows():
        if _is_blank(row['body']) or _is_blank(row['headline']):
            overall = False
            print(f'==== Failed case: {row}')

    print(f'==== Overall result: {overall} ====')
    # Hand the verdict back so callers can act on it instead of parsing stdout.
    return overall

In [4]:
def minimize(df, size=10000, random_state=None):
    """Return a random sample of at most ``size`` rows from ``df``.

    Args:
        df: DataFrame to sample from.
        size: Maximum number of rows to keep.
        random_state: Optional seed forwarded to ``DataFrame.sample`` so the
            same rows can be drawn again; ``None`` draws a fresh sample on
            every call.

    Returns:
        DataFrame with ``min(size, len(df))`` randomly chosen rows.
    """
    # Sampling without replacement raises ValueError when more rows are
    # requested than exist, so cap the request at the frame's length.
    return df.sample(min(size, len(df)), random_state=random_state)

In [6]:
# Half-year CSV files and the directory they are read from.
files = [
    '2017_1.csv', '2017_2.csv',
    '2018_1.csv', '2018_2.csv',
    '2019_1.csv', '2019_2.csv',
]
path = './data/polusa_balanced/'

# Load each file in turn and run the body/headline check on it.
for csv_name in files:
    csv_frame = pd.read_csv(path + csv_name)
    print(f'Validating file {csv_name}...')
    validate(csv_frame)

Validating file 2017_1.csv...
==== Overall result: True ====
Validating file 2017_2.csv...
==== Overall result: True ====
Validating file 2018_1.csv...
==== Overall result: True ====
Validating file 2018_2.csv...
==== Overall result: True ====
Validating file 2019_1.csv...
==== Overall result: True ====
Validating file 2019_2.csv...
==== Overall result: True ====


In [10]:
from datetime import datetime

out_dir = './data/polusa_raw'
# open() does not create missing parent directories, so make sure the
# target folder exists before the first write.
os.makedirs(out_dir, exist_ok=True)

# Number of documents written so far; doubles as the 1-based file index,
# so the first file is still document_000001.txt.
doc_cnt = 0
print('Start converting documents to *.txt')
start = datetime.now()
for file in files:
    df = pd.read_csv(f'{path}{file}')
    # Only the 'body' column is written, so iterate it directly instead of
    # materialising every full row with iterrows().
    for content in df['body']:
        doc_cnt += 1
        # The with-block closes the file on exit; no explicit close() needed.
        with open(f'{out_dir}/document_{str(doc_cnt).zfill(6)}.txt', "w", encoding="utf-8") as f:
            f.write(content)
        # Counting before the write keeps this message (and the final total)
        # equal to the number of files actually produced.
        if doc_cnt % 10000 == 0:
            print(f'{doc_cnt} documents converted...')
print(f'=== FINISHED === Total of {doc_cnt} documents converted. {datetime.now() - start } elapsed')

Start converting documents to *.txt
10000 documents converted...
20000 documents converted...
30000 documents converted...
40000 documents converted...
50000 documents converted...
60000 documents converted...
70000 documents converted...
80000 documents converted...
90000 documents converted...
100000 documents converted...
110000 documents converted...
120000 documents converted...
130000 documents converted...
140000 documents converted...
150000 documents converted...
160000 documents converted...
170000 documents converted...
180000 documents converted...
190000 documents converted...
200000 documents converted...
210000 documents converted...
220000 documents converted...
230000 documents converted...
240000 documents converted...
250000 documents converted...
260000 documents converted...
270000 documents converted...
280000 documents converted...
290000 documents converted...
300000 documents converted...
310000 documents converted...
320000 documents converted...
330000 docume

In [20]:
# Collect a 20,000-row random sample from every input file, in file order.
dfs = []
for csv_name in files:
    print(f'=== Reading {csv_name} ===')
    full_frame = pd.read_csv(path + csv_name)
    sampled_frame = minimize(full_frame, size=20000)
    dfs.append(sampled_frame)

=== Reading 2017_1.csv ===
=== Reading 2017_2.csv ===
=== Reading 2018_1.csv ===
=== Reading 2018_2.csv ===
=== Reading 2019_1.csv ===
=== Reading 2019_2.csv ===


In [23]:
print(f'=== Merging all dataframes ===')
# concat pairs keys with frames one-to-one, so a single-element key list
# covers only the first frame. Pass one key per frame (its source file name)
# so every sampled frame ends up in the merged result, labelled by origin.
merged = pd.concat(dfs, keys=files)

=== Merging all dataframes ===


In [26]:
# Save each sampled frame under its source file's name (relative to the
# current working directory), keeping the DataFrame index as a column.
for position, target_name in enumerate(files):
    print(f'=== Writing to {target_name} ===')
    sampled_frame = dfs[position]
    sampled_frame.to_csv(target_name, encoding="utf-8")
# merged.to_csv(file_name, encoding="utf-8")
print('=== Finished ===')

=== Writing to 2017_1.csv ===
=== Writing to 2017_2.csv ===
=== Writing to 2018_1.csv ===
=== Writing to 2018_2.csv ===
=== Writing to 2019_1.csv ===
=== Writing to 2019_2.csv ===
=== Finished ===
