In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: print the names of the entries found under ../input
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

**Read Data**

In [None]:
# Load the training and test CSV files from the input directory into DataFrames.
df_train, df_test = (
    pd.read_csv("../input/train.csv"),
    pd.read_csv("../input/test.csv"),
)



**A Look at the Data**

In [None]:
# Preview the first 15 rows of the training frame
df_train.head(15)

In [None]:
# Preview the first rows of the test frame
df_test.head()

In [None]:
# (rows, columns) of the training frame
df_train.shape

In [None]:
# Summary statistics of the training frame (pandas' default describe() output)
df_train.describe()

**Separating our Data based on Types**

In [None]:
# Keep only the integer-typed columns of the training frame; `train_num` holds
# their names and `df_train_num` the corresponding sub-frame.
df_train_num = df_train.select_dtypes(include=['int'])
train_num = df_train_num.columns
df_train_num.head()

In [None]:
# Column-wise totals of every integer column selected above.
# Despite its name, `df_threat` covers all of those columns, not a single one.
df_threat = df_train_num.sum()

In [None]:
# Show the first few per-column totals
df_threat.head()

In [None]:
# The same totals as a plain array, without the column labels
df_threat.values

**Visualize the Data**

In [None]:
# Bar chart of the per-column totals computed in `df_threat`
df_threat.plot(kind="bar")
plt.show()

In [None]:
# Pairwise correlation between the integer columns, computed once and reused.
# (The original evaluated corr() twice and discarded the first result.)
corr_matrix = df_train_num.corr()

# annot=True writes each coefficient into its cell; the trailing semicolon
# keeps the Axes repr out of the cell output.
sns.heatmap(corr_matrix, annot=True);

In [None]:
import nltk
import string
import re
from nltk.stem import WordNetLemmatizer 
  
# Single shared lemmatizer instance; clean_data calls it for every token
lemmatizer = WordNetLemmatizer() 

**Data Cleanup**
* Remove Punctuation
* Tokenization
* Remove Stopwords
* Lemmatization

In [None]:
# English stop-word list from NLTK's stopwords corpus; clean_data filters
# tokens against this list. Printed once for inspection.
from nltk.corpus import stopwords
stop_words = stopwords.words("english") 
print(stop_words)

In [None]:
def clean_data(text):
    """Normalise one comment for vectorization.

    Steps, in order:
      1. strip every character listed in ``string.punctuation``;
      2. split the remainder into tokens on runs of non-word characters;
      3. drop empty tokens and stop words (matched case-insensitively);
      4. lemmatize each surviving token with the module-level ``lemmatizer``.

    Parameters
    ----------
    text : str
        Raw comment text.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces ("" if nothing survives).
    """
    # Remove Punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Tokenization -- raw string so "\W" is a regex class, not an invalid
    # string escape; empty strings produced at the edges by re.split are dropped.
    tokens = [token for token in re.split(r"\W+", text) if token]

    # Remove Stopword -- compare in lower case so capitalised stop words
    # ("The", "And", ...) are filtered as well; original casing is kept for
    # the tokens that survive.
    kept = [word for word in tokens if word.lower() not in stop_words]

    # Lemmatization
    return " ".join(lemmatizer.lemmatize(word) for word in kept)

In [None]:
# Replace each training comment with its cleaned form (this overwrites the raw text).
df_train["comment_text"] = df_train["comment_text"].apply(clean_data)
df_train.head()

In [None]:
# Replace each test comment with its cleaned form (this overwrites the raw text).
df_test["comment_text"] = df_test["comment_text"].apply(clean_data)
df_test.head()

**Feature Creation**
* Comment Length
* Punctuation percentage

In [None]:
def comment_len(text):
    """Return how many characters of ``text`` are not the space character ' '.

    Only ' ' is excluded; tabs, newlines and other whitespace still count.
    """
    return sum(1 for char in text if char != ' ')

In [None]:
def punctuation_percentage(text):
    """Return the share of punctuation among the non-space characters of ``text``.

    Parameters
    ----------
    text : str
        Text to inspect.

    Returns
    -------
    int or float
        ``0`` when ``text`` has no non-space characters; otherwise the
        percentage (0-100) of non-space characters that appear in
        ``string.punctuation``, rounded to one decimal place.
    """
    non_space = len(text) - text.count(' ')
    if non_space == 0:
        # Empty or all-space text: avoid dividing by zero
        return 0
    punct_count = sum(1 for char in text if char in string.punctuation)
    # Scale to a percentage *before* rounding; rounding the fraction first and
    # then multiplying by 100 leaves float noise such as 33.300000000000004.
    return round(100 * punct_count / non_space, 1)
  

In [None]:
# Derive the two hand-made features for every training comment.
# NOTE(review): comment_text has already been passed through clean_data, which
# removes all string.punctuation characters, so the punctuation percentage is
# expected to be 0 for every row -- consider computing it on the raw text
# before the cleaning step.
df_train["comment_text_length"] = df_train["comment_text"].apply(comment_len)
df_train["comment_text_Punct_percent"] = df_train["comment_text"].apply(punctuation_percentage)

# Every second row among the first 20, as a quick visual check
df_train.iloc[:20:2]

In [None]:
# Derive the same two hand-made features for every test comment.
# NOTE(review): comment_text has already been passed through clean_data, which
# removes all string.punctuation characters, so the punctuation percentage is
# expected to be 0 for every row.
df_test["comment_text_length"] = df_test["comment_text"].apply(comment_len)
df_test["comment_text_Punct_percent"] = df_test["comment_text"].apply(punctuation_percentage)

In [None]:
# Stack the cleaned train and test comments into one Series; the vectorizer
# below is fitted on this combined text.
all_text = pd.concat([df_train["comment_text"], df_test["comment_text"]])

**Vectorization**

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the vocabulary from the combined train + test text.
count_vect = CountVectorizer().fit(all_text)

# Only the first 20 comments of each frame are turned into count vectors.
train_counts = count_vect.transform(df_train["comment_text"].iloc[:20])
test_counts = count_vect.transform(df_test["comment_text"].iloc[:20])

In [None]:
# Shapes of the two count matrices, one per line
print(train_counts.shape, test_counts.shape, sep="\n")

In [None]:
# Vocabulary learned by the vectorizer, in column order.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
(
    count_vect.get_feature_names_out()
    if hasattr(count_vect, "get_feature_names_out")
    else count_vect.get_feature_names()
)

In [None]:
# Column labels for the dense count table.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
feature_names = (
    count_vect.get_feature_names_out()
    if hasattr(count_vect, "get_feature_names_out")
    else count_vect.get_feature_names()
)

# Densify the sparse training counts into a DataFrame with one column per vocabulary term.
train_counts_df = pd.DataFrame(train_counts.toarray(), columns=feature_names)
train_counts_df.head()




In [None]:
# Column labels for the dense count table.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
feature_names = (
    count_vect.get_feature_names_out()
    if hasattr(count_vect, "get_feature_names_out")
    else count_vect.get_feature_names()
)

# Densify the sparse test counts into a DataFrame with one column per vocabulary term.
test_counts_df = pd.DataFrame(test_counts.toarray(), columns=feature_names)

**Creating DataSet**

In [None]:
# Training design matrix: the two hand-made features for the first 20 rows,
# placed side by side with the matching bag-of-words counts.
X_train = pd.concat(
    [
        df_train[["comment_text_length", "comment_text_Punct_percent"]].iloc[:20],
        train_counts_df,
    ],
    axis=1,
)
X_train.head()


In [None]:
# Test design matrix: the two hand-made features for the first 20 test rows,
# placed side by side with the *test* bag-of-words counts.
# (The original concatenated train_counts_df here, so predictions were made on
# the training comments' word counts instead of the test comments'.)
X_test = pd.concat(
    (
        df_test["comment_text_length"][:20],
        df_test["comment_text_Punct_percent"][:20],
        test_counts_df,
    ),
    axis=1,
)
X_test.head()

In [None]:
# Names of the columns to predict, one model per column.
target = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

# Targets for the same first 20 rows that make up X_train.
Y_train = df_train[target].iloc[:20]
Y_train.head()

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [None]:


# LightGBM training parameters, passed unchanged to lgb.train for every target.
# NOTE(review): 'sub_feature' and 'min_data' look like LightGBM aliases for
# feature_fraction and min_data_in_leaf -- confirm against the LightGBM docs.
# If so, min_data=50 exceeds the 20 rows used for training in this notebook,
# which would prevent any split from being made.
params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'sub_feature': 0.5,
    'num_leaves': 10,
    'min_data': 50,
    'max_depth': 10,
}




In [None]:
# Fit one independent binary LightGBM model per target column and collect the
# predictions for X_test, one output column per target.
Y_pred = pd.DataFrame()
for class_name in target:
    # Labels of the current target for the training rows
    t_train = Y_train[class_name]
    d_train = lgb.Dataset(data=X_train, label=t_train)
    # A single boosting round is trained per target
    clf = lgb.train(params=params, train_set=d_train, num_boost_round=1)
    Y_pred[class_name] = clf.predict(X_test)

In [None]:

# First rows of the predictions, one column per target
Y_pred.head()