In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf

from tqdm import tqdm
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import os
import re

In [None]:
# Make the data folder the working directory so the CSVs below can be
# opened by bare filename.
DATA_DIR = '/content/drive/MyDrive/data/'
os.chdir(DATA_DIR)

# Read the three input tables in one pass, all decoded as UTF-8.
train, test, sample_submission = (
    pd.read_csv(csv_name, encoding='utf-8')
    for csv_name in ('train.csv', 'test_x.csv', 'sample_submission.csv')
)

In [None]:
from nltk.tokenize import word_tokenize
import nltk
# Fetch the NLTK resources used by this notebook, in the original order.
# NOTE(review): recent NLTK releases may additionally need 'punkt_tab' for
# word_tokenize -- confirm against the installed version.
for resource in ('stopwords', 'wordnet', 'punkt', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk.corpus import stopwords
# English stop words held in a set for fast membership tests.
stop_words = set(stopwords.words('english'))

In [None]:
def cleansing(text):
    """Remove every character that is not an ASCII letter or whitespace.

    Args:
        text: The input string to clean.

    Returns:
        ``text`` with digits, punctuation and all other non-letter,
        non-whitespace characters deleted. Whitespace is left untouched.
    """
    # The upper-case range must be `A-Z`. The previous `A-z` range also
    # covered the ASCII characters between 'Z' and 'a' ([ \ ] ^ _ `), so
    # those punctuation marks were never stripped. A raw string avoids the
    # invalid-escape warning for `\s`.
    pattern = r'[^a-zA-Z\s]'
    return re.sub(pattern, '', text)

# Clean every document in place; the function is passed directly instead of
# being wrapped in a lambda.
train['text'] = train['text'].map(cleansing)

In [None]:
# Split each cleaned document into a list of word tokens.
train['text'] = train['text'].apply(word_tokenize)
# Token count per document before any stop-word filtering.
train['words_count'] = train['text'].apply(len)
# Lower-case first, then filter. stop_words comes from NLTK's English list,
# whose entries are lower-case, so testing the raw token let capitalised
# stop words ("The", "And", ...) slip through.
train['text'] = train['text'].apply(
    lambda tokens: [t for t in (s.lower() for s in tokens) if t not in stop_words]
)
# Stop words removed = tokens before filtering - tokens remaining.
# (Previously this column stored the number of remaining tokens.)
train['stop_words_count'] = train['words_count'] - train['text'].apply(len)
# NOTE(review): the cleansing step earlier in the notebook strips digits, so
# this count is expected to be 0 for every row -- compute it on the raw text
# if a real numeric feature is wanted.
train['numerics'] = train['text'].apply(lambda tokens: sum(s.isdigit() for s in tokens))

In [None]:
import collections
# Tally token frequencies across all documents labelled author 0.
author0_docs = train.loc[train['author'] == 0, 'text']
counter0 = collections.Counter(token for doc in author0_docs for token in doc)

# Display the 15 most frequent tokens for this author.
counter0.most_common(15)

In [None]:
import collections
# Tally token frequencies across all documents labelled author 1.
author1_docs = train.loc[train['author'] == 1, 'text']
counter1 = collections.Counter(token for doc in author1_docs for token in doc)

# Display the 15 most frequent tokens for this author.
counter1.most_common(15)

In [None]:
import collections
# Tally token frequencies across all documents labelled author 2.
author2_docs = train.loc[train['author'] == 2, 'text']
counter2 = collections.Counter(token for doc in author2_docs for token in doc)

# Display the 15 most frequent tokens for this author.
counter2.most_common(15)

In [None]:
import collections
# Tally token frequencies across all documents labelled author 3.
author3_docs = train.loc[train['author'] == 3, 'text']
counter3 = collections.Counter(token for doc in author3_docs for token in doc)

# Display the 15 most frequent tokens for this author.
counter3.most_common(15)

In [None]:
import collections
# Tally token frequencies across all documents labelled author 4.
author4_docs = train.loc[train['author'] == 4, 'text']
counter4 = collections.Counter(token for doc in author4_docs for token in doc)

# Display the 15 most frequent tokens for this author.
counter4.most_common(15)

In [None]:
# Split into a training part and a held-out part; the target is the 'author'
# column and the features are everything except 'author' and 'index'.
# - `columns=` replaces the positional axis argument (`drop([...], 1)`),
#   which pandas 2.0 no longer accepts.
# - A fixed random_state makes the split, and every score computed from it,
#   reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns=['author', 'index']),
    train['author'],
    random_state=42,
)

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
# The 'text' column holds lists of already lower-cased tokens, not raw
# strings. The default analyzer would call .lower() on each document and
# fail on a list, so supply a callable analyzer that hands the token list
# through unchanged.
vectorizer = TfidfVectorizer(analyzer=lambda tokens: tokens)
# Learn the vocabulary and IDF weights on the training split only, then
# reuse them for the held-out split.
train_vectors = vectorizer.fit_transform(X_train['text'])
test_vectors = vectorizer.transform(X_test['text'])
print(train_vectors.shape, test_vectors.shape)

In [None]:
%%time
import lightgbm as lgb

# Wrap the TF-IDF matrices and their labels as LightGBM datasets.
dtrain = lgb.Dataset(data=train_vectors, label=y_train)
dtest = lgb.Dataset(data=test_vectors, label=y_test)

# Run 1 configuration: max_depth 5, 100 leaves, 5-class 'multiclass' objective.
param_lgb = dict(
    max_depth=5,
    learning_rate=0.33,
    max_bin=1000,
    num_leaves=100,
    objective='multiclass',
    num_class=5,
    metric='softmax',
    num_iter=1000,
)

# Train on dtrain and evaluate against the held-out dataset as it goes.
lgb_model = lgb.train(params=param_lgb, train_set=dtrain, valid_sets=dtest)

In [None]:
# Per-class scores for every held-out row (one column per class).
y_pred = lgb_model.predict(test_vectors)
# Predicted label = index of the highest-scoring column in each row.
y_argmax = np.argmax(y_pred, axis=1)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Score the held-out predictions.
accuracy = accuracy_score(y_test, y_argmax)
# Use macro averaging (unweighted mean over classes). With average='micro'
# on single-label multiclass predictions, precision, recall and F1 all
# equal accuracy, so the report printed one number four times.
precision = precision_score(y_test, y_argmax, average='macro')
recall = recall_score(y_test, y_argmax, average='macro')
f1 = f1_score(y_test, y_argmax, average='macro')
print('accuracy:',accuracy,'\n','precision:',precision,'\n','recall:',recall,'\n','f1_score:',f1)

In [None]:
# Plot the feature importances of the booster trained above.
lgb.plot_importance(booster=lgb_model)

In [None]:
%%time
import lightgbm as lgb

# Wrap the TF-IDF matrices and their labels as LightGBM datasets.
dtrain = lgb.Dataset(data=train_vectors, label=y_train)
dtest = lgb.Dataset(data=test_vectors, label=y_test)

# Run 2 configuration: max_depth 8, 256 leaves, 5-class 'multiclass' objective.
param_lgb = dict(
    max_depth=8,
    learning_rate=0.33,
    max_bin=1000,
    num_leaves=256,
    objective='multiclass',
    num_class=5,
    metric='softmax',
    num_iter=1000,
)

# Train on dtrain and evaluate against the held-out dataset as it goes.
lgb_model = lgb.train(params=param_lgb, train_set=dtrain, valid_sets=dtest)

In [None]:
# Per-class scores for every held-out row; the predicted label is the index
# of the highest-scoring column.
y_pred = lgb_model.predict(test_vectors)
y_argmax = y_pred.argmax(axis=1)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, y_argmax)
# Use macro averaging (unweighted mean over classes). With average='micro'
# on single-label multiclass predictions, precision, recall and F1 all
# equal accuracy, so the report printed one number four times.
precision = precision_score(y_test, y_argmax, average='macro')
recall = recall_score(y_test, y_argmax, average='macro')
f1 = f1_score(y_test, y_argmax, average='macro')
print('accuracy:',accuracy,'\n','precision:',precision,'\n','recall:',recall,'\n','f1_score:',f1)

In [None]:
%%time
import lightgbm as lgb

# Wrap the TF-IDF matrices and their labels as LightGBM datasets.
dtrain = lgb.Dataset(data=train_vectors, label=y_train)
dtest = lgb.Dataset(data=test_vectors, label=y_test)

# Run 3 configuration: max_depth 9, 512 leaves, 5-class 'multiclass' objective.
param_lgb = dict(
    max_depth=9,
    learning_rate=0.33,
    max_bin=1000,
    num_leaves=512,
    objective='multiclass',
    num_class=5,
    metric='softmax',
    num_iter=1000,
)

# Train on dtrain and evaluate against the held-out dataset as it goes.
lgb_model = lgb.train(params=param_lgb, train_set=dtrain, valid_sets=dtest)

In [None]:
# Per-class scores for every held-out row; the predicted label is the index
# of the highest-scoring column.
y_pred = lgb_model.predict(test_vectors)
y_argmax = y_pred.argmax(axis=1)

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
accuracy = accuracy_score(y_test, y_argmax)
# Use macro averaging (unweighted mean over classes). With average='micro'
# on single-label multiclass predictions, precision, recall and F1 all
# equal accuracy, so the report printed one number four times.
precision = precision_score(y_test, y_argmax, average='macro')
recall = recall_score(y_test, y_argmax, average='macro')
f1 = f1_score(y_test, y_argmax, average='macro')
print('accuracy:',accuracy,'\n','precision:',precision,'\n','recall:',recall,'\n','f1_score:',f1)