In [26]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB

## Load Data

In [27]:
train = pd.read_csv('raw_data/fulltrain.csv', header=0, names=['Verdict', 'News'])

## Preprocessing

In [28]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are"
}

def preprocess_news(news):
    # Convert to lower case
    news = news.lower()
    
    # Replace contractions with their longer forms 
    news = news.split()
    new_news = []
    for word in news:
        if word in contractions:
            new_news.append(contractions[word])
        else:
            new_news.append(word)
    news = " ".join(new_news)
    return news

def tokenize(text):
    tokens = nltk.WordPunctTokenizer().tokenize(news)
    
    if stopwords is not None:
        word_list = [word for word in word_list if word not in stopwords]
    stemmer = nltk.stem.porter.PorterStemmer()
    word_list = [stemmer.stem(word) for word in word_list]
    lem = nltk.stem.wordnet.WordNetLemmatizer()
    word_list = [lem.lemmatize(word) for word in word_list]
    
    return tokens
    

In [29]:
 train["News_clean"] = train["News"].apply(lambda x: preprocess_news(x))

## Training

In [30]:
model = MultinomialNB()

In [31]:
mask = train['Verdict'] < 3
train = train.loc[mask]
X_train = train['News_clean']
y_train = train['Verdict']
tfidf_transformer = CountVectorizer(stop_words='english')
X_train_tfidf = tfidf_transformer.fit_transform(X_train)
model.fit(X_train_tfidf, y_train)

MultinomialNB()

In [32]:
y_pred = model.predict(X_train_tfidf)

## Results

In [33]:
def generate_result(test, y_pred, filename):
    ''' generate csv file base on the y_pred '''
    test['Verdict'] = pd.Series(y_pred)
    test.drop(columns=['News'], inplace=True)
    test.to_csv(filename, index=False)

In [34]:
# Use f1-macro as the metric
score = f1_score(y_train, y_pred, average='macro')
print('score on validation = {}'.format(score))

score on validation = 0.9750812730068448


In [35]:
# generate prediction on test data
test = pd.read_csv('raw_data/balancedtest.csv', header=0, names=['Verdict', 'News'])
X_test = test['News']
X_test = list(map(preprocess_news, X_test))
X_test_tfidf = tfidf_transformer.transform(X_test)
y_pred = model.predict(X_test_tfidf.toarray())

generate_result(test, y_pred, "news_predicted.csv")