In [1]:
# `display` and `HTML` are imported from the public `IPython.display` module;
# importing them from `IPython.core.display` is a deprecated path.
from IPython.display import display, HTML

# Inject a CSS rule so the notebook container uses the full browser width.
display(HTML("<style>.container { width:100% !important; }</style>"))

### Input Data Format

{
  'year': 2016.0,
  'month': 1.0,
  'data': [{
        'title': 'EXAMPLE TITLE',
        'year': 2016.0,
        'month': 1.0,
        'day': 20.0,
        'tfidf': [['word1', 0.051231], ['word2', 0.031231]],
        'polarity': 0.1231241, 
        'subjectivity': 0.1241231
      }]
}

In [2]:
import os
import math
import json
import pandas as pd
import pyspark
import pyspark
from cosine_similarity import CosineSimilarity
from pyspark import SparkContext
from glob import glob
import json
import nltk

In [10]:
# List what currently exists under the cosine-similarity output root.
%ls data/cosineSimilarity/all-the-news/

[0m[01;34mdim50_1[0m/  [01;31mdim50_1.tar.gz[0m  [01;34mdim50_2[0m/


In [19]:
# Where the monthly TF-IDF JSON files are read from, and where the per-month
# CSV results are written.
input_folder = 'data/tfidf/all-the-news/'
output_folder = 'data/cosineSimilarity/all-the-news/dim50_2'

# Dimensionality of the GloVe vectors; it also selects which vector file is used.
dim = 50
vector_model_path = 'glove.6B/glove.6B.{dim}d.txt'.format(dim=dim)

# os.makedirs creates any missing parent directories and, with exist_ok=True,
# does nothing when the directory is already there. Real failures (for example
# a permission error) now surface instead of being swallowed.
os.makedirs(input_folder, exist_ok=True)
os.makedirs(output_folder, exist_ok=True)

# Sorted so the list of monthly files is in the same order on every run.
articles_names = sorted(glob(os.path.join(input_folder, 'articles_2016*')))

In [20]:
# Reference vocabulary that articles are compared against.
tokens = [
    'technology',
    'phone',
    'internet',
    'security',
    'online',
    'economy',
]

# Similarity helper constructed with the configured dimension and GloVe file.
cos_sim = CosineSimilarity(
    dim,
    glove_path=vector_model_path,
)

In [21]:
# One argument tuple per monthly file, in the order the worker function
# unpacks it: (tokens, cos_sim, input_folder, output_folder, file name).
arguments = [
    (tokens, cos_sim, input_folder, output_folder, os.path.basename(path))
    for path in articles_names
]

In [22]:
def cosine_similarity_news_articles(args):
    """Score one monthly articles file against the reference tokens and save a CSV.

    Parameters
    ----------
    args : tuple
        ``(tokens, cos_sim, input_folder, output_folder, articles_name)``:

        * ``tokens`` - reference words the articles are compared against.
        * ``cos_sim`` - object providing ``sentence_to_vec(tokens)`` and
          ``cosine(vec_a, vec_b)``.
        * ``input_folder`` - folder containing the monthly JSON file.
        * ``output_folder`` - folder the CSV is written to.
        * ``articles_name`` - file name of the monthly JSON file; the output
          uses the same name with ``.json`` replaced by ``.csv``.

    Returns
    -------
    None
        The result is the CSV written to ``output_folder``. It has one row per
        article, ordered by day, with the columns ``date, year, month, day,
        title, polarity, subjectivity, relatedness_title, relatedness_content``.

    Notes
    -----
    * If the JSON cannot be parsed or sorted, a header-only CSV is written and
      the function returns early.
    * ``relatedness_title`` is computed from the tokenized title and
      ``relatedness_content`` from the article's TF-IDF terms. Either score
      falls back to ``0.0`` when it cannot be computed.
    * The TF-IDF terms are read from the ``'tfidf'`` key; the legacy spelling
      ``'tdidf'`` is accepted as a fallback.
    """
    tokens, cos_sim, input_folder, output_folder, articles_name = args

    columns = ['date', 'year', 'month',
               'day', 'title',
               'polarity', 'subjectivity',
               'relatedness_title',
               'relatedness_content']

    articles_path = os.path.join(input_folder, articles_name)
    output_path = os.path.join(output_folder, articles_name.replace('.json', '.csv'))

    print('Analyzing {}'.format(articles_path))

    with open(articles_path) as f:
        try:
            month = json.load(f)
            month['data'].sort(key=lambda datum: datum['day'])
        except Exception:
            # Still emit a header-only CSV so every input file has an output.
            pd.DataFrame(columns=columns).to_csv(output_path, index=False)
            print('Error in from {}'.format(articles_path))
            return

    vector_compare = cos_sim.sentence_to_vec(tokens)

    # Collect plain dicts and build the DataFrame once at the end; appending a
    # one-row frame per article copies the whole frame on every iteration.
    records = []
    for data in month['data']:
        # Prefer the documented 'tfidf' key, fall back to the legacy 'tdidf'.
        weighted_terms = data['tfidf'] if 'tfidf' in data else data['tdidf']
        tokens_tfidf = [term[0] for term in weighted_terms]
        tokens_title = nltk.word_tokenize(data['title'])

        records.append({
            'date': '{Y}-{M:02}-{D:02}'.format(Y=int(data['year']),
                                               M=int(data['month']),
                                               D=int(data['day'])),
            'year': data['year'],
            'month': data['month'],
            'day': data['day'],
            'title': data['title'],
            'polarity': data['polarity'],
            'subjectivity': data['subjectivity'],
            'relatedness_title': _relatedness(cos_sim, tokens_title, vector_compare),
            'relatedness_content': _relatedness(cos_sim, tokens_tfidf, vector_compare),
        })

    month_df = pd.DataFrame(records, columns=columns)
    month_df.to_csv(output_path, index=False)

    print('Finished analyzing {} with {} results'.format(articles_path, len(month_df)))
    print(output_path)
    return


def _relatedness(cos_sim, candidate_tokens, vector_compare):
    """Return the cosine similarity of ``candidate_tokens`` to the reference vector, or 0.0 on failure."""
    try:
        return cos_sim.cosine(cos_sim.sentence_to_vec(candidate_tokens), vector_compare)
    except Exception:
        # Best effort: an article that cannot be vectorised scores 0.0.
        return 0.0

In [2]:
print('Starting analysis')
sc = pyspark.SparkContext(appName='News Articles')
try:
    # One partition per monthly file so each file is handled by its own task.
    # max(1, ...) avoids requesting zero partitions when no input files matched.
    num_partitions = max(1, len(arguments))
    process_cosine_by_month = sc.parallelize(arguments, num_partitions).map(cosine_similarity_news_articles)
    # collect() is what actually triggers the map; the worker function returns
    # None, so the collected list only signals that every task finished.
    collect_cosine_by_month = process_cosine_by_month.collect()
finally:
    # Always release the context, even if a task raised; otherwise re-running
    # this cell would try to create a second SparkContext in the same kernel.
    sc.stop()
print('Finished analysis')

Starting analysis
Finished analysis
