In [5]:
import csv
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
# Read training data
train_index = list()
y_train = list()    
with open('data/train.csv', 'r') as f:
    for line in f:
        t = line.split(',')
        train_index.append(int(t[0]))
        y_train.append(int(t[1]))

# Read test data
test_index = list()  
with open('data/test.csv', 'r') as f:
    for line in f:
        t = line.split(',')
        test_index.append(int(t[0]))

# Load the textual content of the messages into the dictionary "posts"
posts = dict()
with open('data/posts.tsv', 'r') as f:
    for line in f:
        t = line.split('\t')
        posts[int(t[0])] = t[2][:-1]

# Create 2 lists: one containing the messages of the training set and the other containing the messages of the
# test set
train_posts = [posts[idx] for idx in train_index]
test_posts = [posts[idx] for idx in test_index]

# Create the training matrix. Each row corresponds to a message and each column to a word present in at least 5
# messages of the training set. The value of each entry in a row is equal to the tf-idf weight of that word in the 
# corresponding message 
vectorizer = TfidfVectorizer(stop_words='english',min_df=5)
X_train = vectorizer.fit_transform(train_posts)

# Create the test matrix following the same approach as in the case of the training matrix
X_test = vectorizer.transform(test_posts)

print("Train matrix dimensionality: ", X_train.shape)
print("Test matrix dimensionality: ", X_test.shape)

# Use logistic regression to classify the messages of the test set
clf = LogisticRegression(solver='newton-cg', multi_class='multinomial')
clf.fit(X_train, y_train)
y_pred_train = clf.predict_proba(X_train)
y_pred = clf.predict_proba(X_test)

# Write predictions to a file
with open('text_train_baseline_submission.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for i,idx in enumerate(train_index):
        lst = y_pred_train[i,:].tolist()
        writer.writerow(lst)
        

# Write predictions to a file
with open('text_baseline_submission.csv', 'w') as csvfile:
    writer = csv.writer(csvfile, delimiter=',')
    for i,idx in enumerate(test_index):
        lst = y_pred[i,:].tolist()
        writer.writerow(lst)

Train matrix dimensionality:  (13221, 5671)
Test matrix dimensionality:  (3306, 5671)


In [23]:
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier

In [7]:
X_train_dev, X_test_dev, y_train_dev, y_test_dev = train_test_split(X_train, y_train, test_size=0.2)

In [32]:
clf = RandomForestClassifier(max_depth=103,criterion = 'entropy')
clf.fit(X_train_dev, y_train_dev)
y_pred = clf.predict_proba(X_test_dev)
log = log_loss(y_test_dev,y_pred)
print("log loss lin:", log)

log loss lin: 1.2137472990448082


In [21]:
clf = SVC(gamma='auto',probability = True)
clf.fit(X_train_dev, y_train_dev)
y_pred_dev_SVC = clf.predict_proba(X_test_dev)
log = log_loss(y_test_dev,y_pred_dev_SVC)
print("log loss SVC:", log)

log loss SVC: 1.244777428353459


In [27]:
clf = clf = SGDClassifier(tol=1e-6,loss = 'log',eta0 = 0.0001,learning_rate = 'adaptive',max_iter = 10000)
clf.fit(X_train_dev, y_train_dev)
y_pred_dev_SGD = clf.predict_proba(X_test_dev)
log = log_loss(y_test_dev,y_pred_dev_SGD)
print("log loss SGD:", log)

log loss SGD: 1.3476977634172946




In [28]:
clf = LogisticRegression(solver='newton-cg', multi_class='multinomial')
clf.fit(X_train_dev, y_train_dev)
y_pred_dev_lin = clf.predict_proba(X_test_dev)
log = log_loss(y_test_dev,y_pred_dev_lin)
print("log loss SVC:", log)

log loss SVC: 1.2267951195964124
