In [34]:
# Data pre-processing
# First attempt at converting the training data into a bag-of-words matrix with CountVectorizer

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

def create_dataframe(wm, feat_names):
    """Build a dense, labelled DataFrame from a document-term matrix.

    Args:
        wm: Matrix-like object exposing ``shape`` and ``toarray()``; each row
            is labelled as one document.
        feat_names: Column labels, one per column of ``wm``.

    Returns:
        pandas.DataFrame with rows labelled ``Doc0``, ``Doc1``, ... and
        columns labelled by ``feat_names``.
    """
    # Derive the row labels from the row count; iterating the matrix itself
    # would build a throwaway row object per document just to count them.
    doc_names = ['Doc{:d}'.format(idx) for idx in range(wm.shape[0])]
    # NOTE: toarray() densifies the whole matrix in memory.
    return pd.DataFrame(data=wm.toarray(), index=doc_names,
                        columns=feat_names)


# Read the training.csv
raw_data = pd.read_csv('training.csv')
df = pd.DataFrame(raw_data)
#print(df.head())

# Some data cleaning and preprocessing
#
# Replace the commas with spaces and strip the underscores so that
# CountVectorizer sees space-separated words. regex=False makes both
# replacements literal regardless of the pandas version's default.
df['article_words'] = (
    df['article_words']
    .str.replace(",", " ", regex=False)
    .str.replace("_", "", regex=False)
)

# Define the input (x) only AFTER cleaning: a Series grabbed before the
# column is reassigned is not guaranteed to reflect the cleaned text
# (it depends on pandas view/copy semantics), so bind it here instead.
x = df['article_words']
# Define the labelled output (y)
y = df['topic']

# Convert the cleaned pandas Series to a NumPy array
x_array = x.values

# Create and fit the CountVectorizer
count = CountVectorizer()
training_bag_of_words = count.fit_transform(x_array)

# Just some testing
print(x_array)
print(training_bag_of_words.shape)
print(training_bag_of_words)

# Resolve the vocabulary once. get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2; prefer get_feature_names_out() and
# fall back to the old method only on versions that lack it.
if hasattr(count, 'get_feature_names_out'):
    feature_names = count.get_feature_names_out()
else:
    feature_names = count.get_feature_names()

# create a dataframe from the matrix
df_wc = create_dataframe(training_bag_of_words, feature_names)
# print(df_wc)

# save word counts by document to separate csv file
df_wc.to_csv('wordcounts.csv')

# Using TF_IDF transformer to get IDF values, convert to data frame
tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
tfidf_transformer.fit(training_bag_of_words)
# NOTE: the column is labelled 'tfidf' but holds the per-term IDF weights
# (tfidf_transformer.idf_); the label is kept so the CSV header is unchanged.
df_idf = pd.DataFrame(tfidf_transformer.idf_, index=feature_names,
                      columns=['tfidf'])
# sort by descending
df_idf = df_idf.sort_values(by=['tfidf'], ascending=False)
# print(df_idf)
# print(df_idf.shape)

# save idf counts to separate csv file
df_idf.to_csv('idf_values.csv')

# Encode the labelled data: fit the encoder on y and map y to integer
# codes in a single step
label_enc = preprocessing.LabelEncoder()
y_train = label_enc.fit_transform(y)
#print(label_enc.classes_)

#print(y_train.shape)

['open absent cent cent cent stock inflow rate kim end forecast won won won won won myong defend dull limit limit continu continu line invest bank bank bank bank bank peg tuesday unwind back fear due move move foreign foreign wednesday wednesday wednesday hwan firm firm rise level today today dollar dollar dollar dollar dollar dollar dollar dollar belief korean intervent deal deal deal deal buy expect suppl suppl posit monday fact export trad trad fresh domest clos clos clos test mid sale compar'
 'morn stead end end day day day patch patch patch index point kiwi kiwi time busi bank early nz year rang weight bid bid auckland finish pret pret pick surpr trickl dollar dollar deal deal deal overnight high high monday trad trad trad trad clos clos base base sell wellington numb absorb low reason basi zealand newsroom rally rally currenc currenc'
 'socc socc world world recent law fifa fifa fifa fifa stat stat stat stat govern own brazil time footbal body confus spokesm warn year european e