### Importing the libraries

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### Importing the dataset

In [0]:
dataset_train = pd.read_csv('train.csv')
dataset_test = pd.read_csv('test.csv')

In [80]:
print("No. of train data: ", dataset_train.shape[0])
print("No. of test data: ", dataset_test.shape[0])

No. of train data:  7613
No. of test data:  3263


In [0]:
X_train = dataset_train['text']
X_test = dataset_test['text']
mix = X_train.append(X_test, ignore_index=True)

In [6]:
mix.head

<bound method NDFrame.head of 0        Our Deeds are the Reason of this #earthquake M...
1                   Forest fire near La Ronge Sask. Canada
2        All residents asked to 'shelter in place' are ...
3        13,000 people receive #wildfires evacuation or...
4        Just got sent this photo from Ruby #Alaska as ...
                               ...                        
10871    EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
10872    Storm in RI worse than last hurricane. My city...
10873    Green Line derailment in Chicago http://t.co/U...
10874    MEG issues Hazardous Weather Outlook (HWO) htt...
10875    #CityofCalgary has activated its Municipal Eme...
Name: text, Length: 10876, dtype: object>

In [7]:
mix[0]

'Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all'

In [8]:
Y_train = dataset_train['target']
Y_train.head

<bound method NDFrame.head of 0       1
1       1
2       1
3       1
4       1
       ..
7608    1
7609    1
7610    1
7611    1
7612    1
Name: target, Length: 7613, dtype: int64>

In [17]:
print(dataset_train.groupby('target').target.count())

target
0    4342
1    3271
Name: target, dtype: int64


## **Count Vectorizer**

*   It tokenizes all the letters/words with length atleast 2
*   Converts everything to lower case
*   Builds a vocabulary
*   Removes very frequently occuring words









In [0]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(max_features = 20000, stop_words = 'english', ngram_range = (1,2))
mix_vectorized = vect.fit_transform(mix)

In [81]:
vect.get_feature_names()[::5000]

['00', 'en route', 'nankana sahib', 'rap nonprofit']

In [54]:
print(mix_vectorized.toarray())

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [56]:
mix_vectorized.shape

(10876, 20000)

In [0]:
# Splitting Train and Test data
mix_train = mix_vectorized[:7613, :]
mix_test = mix_vectorized[7613: , :]

In [72]:
print("Shape of train data: ", mix_train.shape)
print("Shape of test data: ", mix_test.shape)

Shape of train data:  (7613, 20000)
Shape of test data:  (3263, 20000)


## Using Gradient Boosting

In [74]:
from sklearn.ensemble import GradientBoostingClassifier
classifier = GradientBoostingClassifier(n_estimators = 3000, learning_rate=1.0, max_depth=3, random_state=0)
classifier.fit(mix_train, Y_train)

GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=1.0, loss='deviance', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_impurity_split=None,
                           min_samples_leaf=1, min_samples_split=2,
                           min_weight_fraction_leaf=0.0, n_estimators=3000,
                           n_iter_no_change=None, presort='deprecated',
                           random_state=0, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)

### Predicting the test results

In [0]:
Y_pred = classifier.predict(mix_test)

In [77]:
print(Y_pred)

[1 1 1 ... 1 1 0]


In [0]:
res = pd.DataFrame(Y_pred)
res.columns = ["target"]
res.to_csv("sub_GB.csv")