In [257]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import csv
from nltk import ngrams, FreqDist
from dct import category_dct

# Text-normalisation helpers. The stemmer is used by the tokenising functions
# below; the lemmatizer is not referenced in the cells visible here.
wnl     = nltk.WordNetLemmatizer()
stemmer = nltk.PorterStemmer()

# NLTK's English stop words as a plain list (sliced for inspection in the next cell).
stop_words = list(stopwords.words('english'))
# Apply seaborn's default plot styling.
sns.set()

In [258]:
# Peek at the first ten stop words to confirm the list loaded.
stop_words[:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [259]:
# Load the raw keyword phrases; later cells read the 'keywords' column.
df = pd.read_csv('keywords.csv')

In [260]:
# Drop rows that are duplicated across all columns. Reassign rather than use
# inplace=True so the step stays chainable and reads as an explicit new value.
df = df.drop_duplicates()

In [261]:
def keyword_lst(keyword):
    """Turn a keyword phrase into a list of stemmed, stop-word-free tokens.

    The phrase is tokenised with NLTK's ``word_tokenize``, tokens found in the
    module-level ``stop_words`` list are discarded, and the remaining tokens
    are reduced to their Porter stems.

    Stop-word matching is case-insensitive: tokens are lower-cased before the
    comparison because the NLTK English stop-word list is lower-case, so
    "How" and "how" are both removed. The stems themselves are unaffected,
    since ``PorterStemmer.stem`` lower-cases its input by default.

    Parameters
    ----------
    keyword : str
        Raw keyword phrase, e.g. ``"how long does jet lag last"``.

    Returns
    -------
    list of str
        Stemmed tokens in their original order, e.g.
        ``['long', 'jet', 'lag', 'last']``.
    """
    # Lower-case first so capitalised stop words ("How", "To") are caught.
    tokens = [token.lower() for token in word_tokenize(keyword)]
    tokens = [token for token in tokens if token not in stop_words]
    return [stemmer.stem(token) for token in tokens]

In [262]:
# Tokenise, remove stop words from and stem every keyword phrase into a new list column.
df['keyword_lst'] = df['keywords'].apply(keyword_lst)

In [263]:
# Preview the first rows to check the new keyword_lst column.
df.head()

Unnamed: 0,keywords,keyword_lst
0,avoid jet lag flying to usa,"[avoid, jet, lag, fli, usa]"
1,effects of jet lag,"[effect, jet, lag]"
2,how long does jet lag last,"[long, jet, lag, last]"
3,how to deal with jet lag from asia,"[deal, jet, lag, asia]"
4,how to prevent jet lag on long flights,"[prevent, jet, lag, long, flight]"


# Stem dictionaries

In [265]:
def stem_dct(dct):
    """Stem every phrase in a two-level category dictionary.

    ``dct`` maps a category key to a dictionary of ``topic -> list of
    phrases``. The nesting and all keys are preserved; each phrase string is
    replaced by the token list that ``keyword_lst`` produces for it, so the
    dictionary phrases and the keyword column are normalised by exactly the
    same pipeline (tokenise, drop stop words, Porter-stem).

    Parameters
    ----------
    dct : dict
        ``{category: {topic: [phrase, ...]}}`` with string phrases.

    Returns
    -------
    dict
        ``{category: {topic: [[stem, ...], ...]}}``; a new dictionary, the
        input is not modified.
    """
    return {
        key: {
            # Reuse the shared helper instead of repeating its three steps here.
            topic: [keyword_lst(phrase) for phrase in phrases]
            for topic, phrases in topics.items()
        }
        for key, topics in dct.items()
    }

In [266]:
# Stem the imported category dictionary once so it can be matched against the stemmed keywords.
topic_dct_stemmed = stem_dct(category_dct)

# Categorise topics

In [269]:
def categorise_topics(keyword_lst, topic_dct=None):
    """Find every topic whose stemmed phrase is contained in a keyword.

    A topic matches when all stems of at least one of its phrases occur in
    ``keyword_lst`` (order and adjacency are not considered).

    Parameters
    ----------
    keyword_lst : list of str
        Stemmed tokens of one keyword phrase.
    topic_dct : dict, optional
        ``{category: {topic: [[stem, ...], ...]}}``. Defaults to the
        module-level ``topic_dct_stemmed``.

    Returns
    -------
    dict or None
        ``{topic: category}`` for every matching topic, or ``None`` when
        nothing matches.
    """
    if topic_dct is None:
        topic_dct = topic_dct_stemmed

    # Build the lookup set once instead of once per phrase.
    keyword_set = set(keyword_lst)

    matches = dict()
    for key, topics in topic_dct.items():
        for topic, phrases in topics.items():
            # An empty phrase (e.g. one made only of stop words) is a subset of
            # every keyword and would tag everything, so it is skipped.
            # any() stops at the first phrase that matches.
            if any(phrase and keyword_set.issuperset(phrase) for phrase in phrases):
                matches[topic] = key

    return matches or None

In [270]:
# Tag each keyword with the topics whose stemmed phrases it contains (None when nothing matches).
df['topics'] = df['keyword_lst'].apply(categorise_topics)

In [271]:
# Preview the first rows to check the new topics column.
df.head()

Unnamed: 0,keywords,keyword_lst,topics
0,avoid jet lag flying to usa,"[avoid, jet, lag, fli, usa]",{'jet lag': 'topic'}
1,effects of jet lag,"[effect, jet, lag]",{'jet lag': 'topic'}
2,how long does jet lag last,"[long, jet, lag, last]",{'jet lag': 'topic'}
3,how to deal with jet lag from asia,"[deal, jet, lag, asia]",{'jet lag': 'topic'}
4,how to prevent jet lag on long flights,"[prevent, jet, lag, long, flight]",{'jet lag': 'topic'}


# To aggregate
- groupby on article ID
- build a dictionary per article ID with counts of the different tags
- put top 5 topics on the summary sheet

# Write file out

In [191]:
# Persist the tagged keywords to CSV. By default to_csv also writes the index as the first column.
df.to_csv('test.csv')