In [1]:
import pandas as pd
import numpy as np 
import gensim

from statistics import mean
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

In [2]:
gensim.__version__

'3.7.3'

In [3]:
#Load the Data (this data is not available in the repo)
frame = pd.read_pickle('df_final.pkl')

In [4]:
#Count the number of Articles
print('no. of rows: '+str(len(frame.index)))

no. of rows: 419814


In [5]:
#Articles' attributes
frame.columns

Index(['_id', 'byline.original', 'section_name', 'document_type',
       'headline.main', 'lead_paragraph', 'snippet', 'pub_date', 'word_count',
       'news_desk'],
      dtype='object')

In [6]:
#Making a list of lead pragraphs for each article
paragraphs=frame['lead_paragraph']
documents=[]
for line in paragraphs:
          documents.append(gensim.utils.simple_preprocess(line))



In [7]:
del paragraphs

In [8]:
#Computing number of Distict Words accross the Documents
words=set()
for doc in documents:
    for word in doc:
        if word not in words:
            words.add(word)
len(words)

178728

# Training Doc2vec

In [9]:
doc_sections_pairs=zip(documents,frame['section_name'])

In [10]:
#Create tagged Documents, eliminate empty documents
tagged_documents = [TaggedDocument(doc, i) 
                    for doc, i in doc_sections_pairs if len(doc)>0]

In [11]:
#Sample of tagged paragraph
tagged_documents[:1]

[TaggedDocument(words=['as', 'some', 'of', 'us', 'know', 'from', 'personal', 'experience', 'finding', 'bedbugs', 'in', 'your', 'apartment', 'is', 'jarring', 'while', 'they', 'don', 'carry', 'diseases', 'to', 'humans', 'that', 'we', 'know', 'of', 'their', 'bites', 'are', 'itchy', 'and', 'can', 'get', 'infected'], tags='Real Estate')]

In [12]:
# Train Distributed Bag of Words
#We saved the weights of this model into models/dbow_paragraph.model
dbow = Doc2Vec(tagged_documents, vector_size=150, window=5, dm=0, min_count=2, workers=10, epochs=20)

In [13]:
def get_vectors(tagged_docs):
    targets, vectors = zip(*[(doc.tags, 
                                 dbow.infer_vector(doc.words,epochs=25))
                                for doc in tagged_docs])
    return  vectors, targets

In [14]:
#Get data and labels
X,y=get_vectors(tagged_documents)

In [15]:
#Get rid of raw data
del documents
del frame

In [16]:
#Split into training and test data
X_train, X_test, y_train, y_test= train_test_split(X,y,
                               test_size=0.3, random_state=42)

In [17]:
#After performing a GridSearch
#we concluded that
#the higher the C parameter (weaker regularization)
#produces better accuracy.
#This model was saved into models/Logistic_topic_clasifier_from_paragraph.sav
log = LogisticRegression(C=1e5)
log.fit(X_train, y_train)



LogisticRegression(C=100000.0, class_weight=None, dual=False,
                   fit_intercept=True, intercept_scaling=1, l1_ratio=None,
                   max_iter=100, multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [18]:
y_pred = log.predict(X_test)
print('Accuracy: {}'.format(accuracy_score(y_test, y_pred)))
print('F1 score: {}'.format(f1_score(y_test, y_pred, average='weighted')))

Accuracy: 0.6775403646879268
F1 score: 0.6631370680647448


  'precision', 'predicted', average, warn_for)


In [20]:
#Sort the predictions in terms of probability
y_pred_proba = log.predict_proba(X_test)
best_n = np.argsort(y_pred_proba, axis=1)

In [21]:
#Number of classes on sections
len(best_n[0])

48

In [22]:
#Possible sections
log.classes_

array(['Arts', 'Automobiles', 'Blogs', 'Books', 'Booming', 'Briefing',
       'Business Day', 'Climate', 'Corrections', 'Crosswords & Games',
       'Education', 'Fashion & Style', 'Food',
       'Great Homes & Destinations', 'Health', 'Home & Garden',
       'Job Market', 'Lens', 'Magazine', 'Movies', 'Multimedia/Photos',
       'NYT Now', 'New York', 'Opinion', 'Podcasts', 'Public Editor',
       'Reader Center', 'Real Estate', 'Science', 'Smarter Living',
       'Sports', 'Style', 'Sunday Review', 'T Magazine', 'Technology',
       'The Learning Network', 'The Upshot', 'Theater', 'Times Insider',
       'Today’s Paper', 'Travel', 'U.S.', 'Universal', 'Watching',
       'Week in Review', 'Well', 'World', 'Your Money'], dtype='<U26')

In [23]:
#Number-Section dictionary
classes_dic=zip(range(48),log.classes_)
classDecoder=dict(classes_dic)

In [24]:
#Get the 3 most likely classes by probability
y_pred_prob = log.predict_proba(X_test)
best_n = np.argsort(y_pred_prob, axis=1)
best_n_classes=[[classDecoder[s] for s in b[-3:] ] for b in best_n]

In [25]:
#accuracy of being in best 3 predictions
length=0
successes=0
for y,b in zip(y_test,best_n_classes):
        length+=1
        if y in b:
            successes+=1
successes/length

0.860759900393606