In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

### Data Format

{
  'year': 2016.0,
  'month': 1.0,
  'data': [{
        'title': 'EXAMPLE TITLE',
        'year': 2016.0,
        'month': 1.0,
        'day': 20.0,
        'tfidf': [['word1', 0.051231], ['word2', 0.031231]],
        'polarity': 0.1231241, 
        'subjectivity': 0.1241231
      }]
}

In [2]:
import os
import math
import json
import pandas as pd
import pyspark
import pyspark
from pyspark import SparkContext
from textblob import TextBlob as tb

import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/willc97/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
os.environ['SPARK_LOCAL_IP'] = 'localhost'
sc = pyspark.SparkContext(appName='News Articles')

sc.stop()

In [17]:
def tf(word, blob):
    return blob.words.count(word) / len(blob.words)

def n_containing(word, bloblist):
    return sum(1 for blob in bloblist if word in blob)

def idf(word, bloblist):
    return math.log(len(bloblist) / (1 + n_containing(word, bloblist)))

def tfidf(word, blob, bloblist):
    return tf(word, blob) * idf(word, bloblist)
    
def strtolower(s):
    return s.lower()

def analyze_news_articles(args):
    input_folder, output_folder, articles_name = args
    articles = os.path.join(input_folder, articles_name)
    
    month_df = pd.read_csv(articles)
    print("Analyzing {name} which has {num_articles} articles".format(name=articles_name, num_articles=len(month_df)))
    
    results = dict()
    results['data'] = list()
    
    for day in range(1, 31+1):
        day_df = month_df.where(month_df.day == day).dropna()
        if len(day_df) == 0:
            continue 
        print("Day {day}: {num_articles} articles".format(day=day, num_articles=len(day_df)))
        titles = list(day_df.title)
        years = list(day_df.year)
        months = list(day_df.month)
        contents = list(day_df.content)
        bloblist = list(map(tb, list(map(strtolower, contents))))
        
        results['year'] = years[0]
        results['month'] = months[0]
        
        for i, blob in enumerate(bloblist):
            scores = {word: tfidf(word, blob, bloblist) for word in blob.words}
            sorted_words = sorted(scores.items(), key=lambda x: x[1], reverse=True)

            result = dict()
            result['title'] = titles[i]
            result['year'] = years[i]
            result['month'] = months[i]
            result['day'] = day

            sentiment = tb(contents[i]).sentiment
            result['polarity'] = sentiment.polarity
            result['subjectivity'] = sentiment.subjectivity
            
            num_dist_words = len(sorted_words)
            result['tdidf'] = sorted_words[:min(30, num_dist_words)]

            results['data'].append(result)
    
    output_name = os.path.join(output_folder, articles_name.replace('.csv', '.json'))
        
    with open(output_name, 'w') as outfile:
        json.dump(results, outfile)
        
    return (output_name, len(results['data']))

In [12]:
input_folder = 'data/filtered/all-the-news/'
output_folder = 'data/tfidf/all-the-news/'
articles_names = os.listdir(input_folder)
arguments = [(input_folder, output_folder, articles_name) for articles_name in articles_names]

# print('Starting analysis')
# process_articles_by_month = sc.parallelize(arguments, len(arguments)).map(analyze_news_articles)
# collect_results_by_month = process_articles_by_month.collect()
# print('Finished analysis')

In [18]:
for argument in arguments[5:]:
    print(analyze_news_articles(argument))

Analyzing articles_2016_12.csv which has 2886 articles
Day 1: 107 articles
Day 2: 140 articles


KeyboardInterrupt: 

In [19]:
import multiprocessing

In [20]:
pool = multiprocessing.Pool(4)

Analyzing articles_2017_07.csv which has 0 articles
Analyzing articles_2015_03.csv which has 490 articles
Day 1: 1 articles
Day 2: 1 articles
Analyzing articles_2016_09.csv which has 2963 articles
Analyzing articles_2016_12.csv which has 2886 articles
Day 1: 116 articles
Day 1: 107 articles
Day 4: 5 articles
Analyzing articles_2017_11.csv which has 0 articles
Analyzing articles_2016_01.csv which has 1766 articles
Day 1: 72 articles
Day 5: 3 articles
Day 6: 20 articles
Day 7: 10 articles
Day 8: 12 articles
Day 9: 21 articles
Day 2: 31 articles
Day 10: 28 articles
Day 3: 57 articles
Day 11: 18 articles
Day 4: 65 articles
Day 12: 22 articles
Day 13: 15 articles
Day 5: 61 articles
Day 14: 15 articles
Day 15: 16 articles
Day 16: 13 articles
Day 17: 22 articles
Day 18: 23 articles
Day 6: 63 articles
Day 19: 25 articles
Day 20: 27 articles
Day 7: 61 articles
Day 21: 10 articles
Day 2: 136 articles
Day 22: 16 articles
Day 23: 20 articles
Day 8: 53 articles
Day 24: 14 articles
Day 25: 22 articl

Day 5: 60 articles
Day 14: 15 articles
Day 15: 2 articles
Day 16: 17 articles
Day 6: 75 articles
Day 17: 1 articles
Day 18: 9 articles
Day 19: 20 articles
Day 20: 15 articles
Day 21: 17 articles
Day 22: 2 articles
Day 23: 1 articles
Day 24: 2 articles
Day 25: 4 articles
Day 26: 9 articles
Day 27: 23 articles
Day 7: 79 articles
Day 28: 13 articles
Analyzing articles_2017_01.csv which has 3655 articles
Day 1: 57 articles
Day 16: 109 articles
Day 2: 107 articles
Day 21: 110 articles
Day 8: 83 articles
Day 9: 79 articles
Day 22: 86 articles
Day 3: 119 articles
Day 17: 124 articles
Day 10: 92 articles
Day 23: 61 articles
Day 11: 52 articles
Day 24: 95 articles
Day 12: 55 articles
Day 18: 64 articles
Day 13: 79 articles
Day 25: 119 articles
Day 4: 151 articles
Day 19: 69 articles
Day 14: 94 articles
Day 20: 120 articles
Day 26: 97 articles
Day 15: 81 articles
Day 16: 92 articles
Day 5: 116 articles
Day 27: 112 articles
Day 17: 117 articles
Day 21: 118 articles
Day 6: 142 articles
Day 28: 91 

In [21]:
results = pool.map(analyze_news_articles, arguments[5:])
pool.close()

In [37]:
%ll data/

total 12
drwxrwxr-x 3 willc97 4096 Dec  9 16:33 [0m[01;34mfiltered[0m/
drwxrwxr-x 4 willc97 4096 Dec  8 20:34 [01;34mraw[0m/
drwxrwxr-x 3 willc97 4096 Dec 10 12:59 [01;34mtfidf[0m/
