#### Importing Header Files

In [209]:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.svm import LinearSVC, SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, accuracy_score, classification_report, confusion_matrix
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
import nltk
import re
from sklearn.ensemble import RandomForestClassifier
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

In [210]:
df1 = pd.read_csv('train.csv')
df1.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [211]:
df2 = pd.read_csv('test.csv')
df2.head()

Unnamed: 0,ID,TITLE,ABSTRACT
0,20973,Closed-form Marginal Likelihood in Gamma-Poiss...,We present novel understandings of the Gamma...
1,20974,Laboratory mid-IR spectra of equilibrated and ...,Meteorites contain minerals from Solar Syste...
2,20975,Case For Static AMSDU Aggregation in WLANs,Frame aggregation is a mechanism by which mu...
3,20976,The $Gaia$-ESO Survey: the inner disk intermed...,Milky Way open clusters are very diverse in ...
4,20977,Witness-Functions versus Interpretation-Functi...,Proving that a cryptographic protocol is cor...


In [212]:
df2.shape

(8989, 3)

In [213]:
df1['All_Text'] = df1['TITLE'] + ' ' + df1['ABSTRACT']

In [214]:
df2['All_Text'] = df2['TITLE'] + ' ' + df2['ABSTRACT']

In [215]:
df1['All_Text'] = df1['All_Text'].map(lambda x: re.sub(r'[^\w\s]+', ' ', x)) # remove special characters
df1['All_Text'] = df1['All_Text'].str.replace('\d+', '') # remove digits

In [216]:
df2['All_Text'] = df2['All_Text'].map(lambda x: re.sub(r'[^\w\s]+', ' ', x)) # remove special characters
df2['All_Text'] = df2['All_Text'].str.replace('\d+', '') # remove digits

In [217]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer

porter=PorterStemmer()

def stemSentence(sentence):
    token_words=word_tokenize(sentence)
    token_words
    stem_sentence=[]
    for word in token_words:
        stem_sentence.append(porter.stem(word))
        stem_sentence.append(" ")
    return " ".join(stem_sentence)

In [218]:
# df1['All_Text'] = df1['All_Text'].apply(stemSentence)

In [219]:
df1.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,All_Text
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,Reconstructing Subject Specific Effect Maps ...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,Rotation Invariance Neural Network Rotation ...
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0,Spherical polyharmonics and Poisson kernels fo...
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0,A finite element approximation for the stochas...
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0,Comparative study of Discrete Wavelet Transfor...


In [220]:
# df2['All_Text'] = df2['All_Text'].apply(stemSentence)

In [221]:
w_tokenizer = nltk.tokenize.WhitespaceTokenizer()
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatize_text(text):
    return [" ".join(lemmatizer.lemmatize(w) for w in w_tokenizer.tokenize(text))]

In [222]:
# df1['All_Text'] =  df1['All_Text'].apply(lemmatize_text)

In [223]:
df1.head(2)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,All_Text
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,Reconstructing Subject Specific Effect Maps ...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,Rotation Invariance Neural Network Rotation ...


In [224]:
# df2['All_Text'] =  df2['All_Text'].apply(lemmatize_text)

In [225]:
# df1['All_Text'] = df1['All_Text'].astype(str)

In [226]:
# df1['All_Text'] = df1['All_Text'].apply(lambda x: x.replace('[', '').replace(']', ''))

In [227]:
# df2['All_Text'] = df2['All_Text'].astype(str)

In [228]:
# df2['All_Text'] = df2['All_Text'].apply(lambda x: x.replace('[', '').replace(']', ''))

In [229]:
df1_bp = df1.copy()

In [230]:
labels = ['Computer Science', 'Physics', 'Mathematics','Statistics','Quantitative Biology', 'Quantitative Finance']

In [231]:
df_train_classes = df1[labels]

In [232]:
# pipeline = Pipeline([
#     ('tfidf', TfidfVectorizer(stop_words='english')),
#     ('clf', KNeighborsClassifier()),
# ])
# parameters = {
#     'tfidf__max_features': [None, 50000, 150000],
#     'tfidf__ngram_range': [(1, 2), (1, 3)],
#     'clf__metric'           : ['minkowski','euclidean'],
#     'clf__weights'          : ['uniform','distance'],
#     'clf__n_neighbors'      : np.arange(5,10)
# }

# grid_search_tune = GridSearchCV(pipeline, parameters, cv=5, verbose=3)

# for label in labels:
#     grid_search_tune.fit(df1['All_Text'], df_train_classes[label])

In [233]:
# print("Best parameters set:")
# print(grid_search_tune.best_estimator_.steps)
# print(grid_search_tune.best_score_)

Best parameters set:
[('tfidf', TfidfVectorizer(ngram_range=(1, 3), stop_words='english')), ('clf', LinearSVC(C=100))]

In [234]:
tfv = TfidfVectorizer(sublinear_tf=True, 
                      norm='l2',
                      encoding='latin-1',
                      ngram_range=(1, 2),
                      stop_words='english')

In [235]:
word_features = tfv.fit_transform(df1['All_Text'])

In [236]:
test_word_features = tfv.transform(df2['All_Text'])

In [237]:
df1.head(2)

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance,All_Text
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0,Reconstructing Subject Specific Effect Maps ...
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0,Rotation Invariance Neural Network Rotation ...


In [238]:
main_test_ids = df2['ID']

In [239]:
X = df1['All_Text']

In [240]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

for name in labels:
    y = df1[name]
    for train_index, test_index in skf.split(X, y):
        x_train, x_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        x_train = tfv.fit_transform(x_train)
        x_test = tfv.transform(x_test)
        test_data = tfv.transform(df2['All_Text'])
        
        lvc = linearSVC(C=100)

        lvc.fit(x_train, y_train)
        train_pred = lvc.predict(x_train)
        y_pred = lvc.predict(x_test)
        print(name)
        score_train = f1_score(y_train, train_pred, average='micro')
        print('Training Score', score_train)
        score_test = f1_score(y_test, y_pred, average='micro')
        print('Validation Score', score_test)
       
        df2[name+str(min(test_index))] = lvc.predict_proba(test_data)[:,1]

Computer Science
Training Score 1.0
Validation Score 0.8491060786650775
Computer Science
Training Score 1.0
Validation Score 0.8288438617401669
Computer Science
Training Score 1.0
Validation Score 0.8366714353838818
Computer Science
Training Score 1.0
Validation Score 0.8426323319027181
Computer Science
Training Score 1.0
Validation Score 0.8459704339532664
Physics
Training Score 1.0
Validation Score 0.9189511323003575
Physics
Training Score 1.0
Validation Score 0.9094159713945172
Physics
Training Score 0.999940398140422
Validation Score 0.912970910824988
Physics
Training Score 1.0
Validation Score 0.909871244635193
Physics
Training Score 0.999940398140422
Validation Score 0.9017644253695756
Mathematics
Training Score 1.0
Validation Score 0.9039332538736591
Mathematics
Training Score 1.0
Validation Score 0.8867699642431466
Mathematics
Training Score 1.0
Validation Score 0.9012875536480687
Mathematics
Training Score 1.0
Validation Score 0.8965188364329996
Mathematics
Training Score 1.0


In [None]:
df2.head(1)

In [None]:
df2.columns

In [None]:
for label in labels:
    spike_cols = [col for col in df2.columns if label in col]
    df2[label] = df2[spike_cols].sum(axis=1)/5

In [204]:
df2 = df2[['ID', 'Computer Science', 'Physics', 'Mathematics', 'Statistics', 'Quantitative Biology', 'Quantitative Finance']]

In [205]:
df2.head()

Unnamed: 0,ID,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,20973,0.555657,0.0,0.244289,1.0,0.0,0.0
1,20974,0.0,1.0,0.0,0.0,0.0,0.0
2,20975,0.644879,0.066483,0.244297,0.332745,0.088804,0.0
3,20976,0.0,1.0,0.0,0.0,0.0,0.0
4,20977,0.82247,0.0,0.399539,0.0,0.0,0.0


In [206]:
df2.to_csv('Output Files/lvc_ksplit_ensemble.csv', index=False)