In [1]:
import time
import re
import numpy as np
import pandas as pd
import warnings;warnings.filterwarnings('ignore')
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [2]:
df_train = pd.read_csv('data/train.csv',lineterminator='\n')
df_test = pd.read_csv('data/test.csv',lineterminator='\n')

In [3]:
df_train['label'] = df_train['label'].map({'Negative':0,'Positive':1})
df_train.head()

Unnamed: 0,ID,review,label,Unnamed: 3,Unnamed: 4
0,1,Asif momin hakir nahi fakir hai mera sabr us N...,0,,
1,2,Phely jaa kr Naha ky ao phr mazi ko khechna,0,,
2,3,ye to bilkul thk kaha aur hamra km hi dushmano...,0,,
3,4,Dukh hi dukh Zindhge mein,0,,
4,5,Or ya assa he hotta ha.jassya khamosh samander...,0,,


In [4]:
df_train = df_train.loc[:,:'label']
df_train.head()

Unnamed: 0,ID,review,label
0,1,Asif momin hakir nahi fakir hai mera sabr us N...,0
1,2,Phely jaa kr Naha ky ao phr mazi ko khechna,0
2,3,ye to bilkul thk kaha aur hamra km hi dushmano...,0
3,4,Dukh hi dukh Zindhge mein,0
4,5,Or ya assa he hotta ha.jassya khamosh samander...,0


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11299 entries, 0 to 11298
Data columns (total 3 columns):
ID        11299 non-null object
review    11299 non-null object
label     11299 non-null int64
dtypes: int64(1), object(2)
memory usage: 264.9+ KB


In [7]:
df_train.label.value_counts()

1    6013
0    5286
Name: label, dtype: int64

In [9]:
df_train.isnull().sum()

ID        0
review    0
label     0
dtype: int64

In [10]:
df_test.isnull().sum()

ID        0
review    0
dtype: int64

In [11]:
numpy_array = df_train.as_matrix()
numpy_array_test = df_test.as_matrix()
numpy_array[:4]

array([['1',
        'Asif momin hakir nahi fakir hai mera sabr us NY sameta hai', 0],
       ['2', 'Phely jaa kr Naha ky ao phr mazi ko khechna', 0],
       ['3',
        'ye to bilkul thk kaha aur hamra km hi dushmano ko jalana h', 0],
       ['4', 'Dukh hi dukh Zindhge mein', 0]], dtype=object)

In [12]:
#two commom ways to clean data
def cleaner(word):
  word = re.sub(r'\#\.', '', word)
  word = re.sub(r'\n', '', word)
  word = re.sub(r',', '', word)
  word = re.sub(r'\-', ' ', word)
  word = re.sub(r'\.', '', word)
  word = re.sub(r'\\', ' ', word)
  word = re.sub(r'\\x\.+', '', word)
  word = re.sub(r'\d', '', word)
  word = re.sub(r'^_.', '', word)
  word = re.sub(r'_', ' ', word)
  word = re.sub(r'^ ', '', word)
  word = re.sub(r' $', '', word)
  word = re.sub(r'\?', '', word)
  word = re.sub(r'é', '', word)
  word = re.sub(r'§', '', word)
  word = re.sub(r'¦', '', word)
  word = re.sub(r'æ', '', word)
  word = re.sub(r'\d+', '', word)
  word = re.sub('(.*?)\d+(.*?)', '', word)
  return word.lower()
def hashing(word):
  word = re.sub(r'ain$', r'ein', word)
  word = re.sub(r'ai', r'ae', word)
  word = re.sub(r'ay$', r'e', word)
  word = re.sub(r'ey$', r'e', word)
  word = re.sub(r'ie$', r'y', word)
  word = re.sub(r'^es', r'is', word)
  word = re.sub(r'a+', r'a', word)
  word = re.sub(r'j+', r'j', word)
  word = re.sub(r'd+', r'd', word)
  word = re.sub(r'u', r'o', word)
  word = re.sub(r'o+', r'o', word)
  word = re.sub(r'ee+', r'i', word)
  if not re.match(r'ar', word):
    word = re.sub(r'ar', r'r', word)
  word = re.sub(r'iy+', r'i', word)
  word = re.sub(r'ih+', r'eh', word)
  word = re.sub(r's+', r's', word)
  if re.search(r'[rst]y', 'word') and word[-1] != 'y':
    word = re.sub(r'y', r'i', word)
  if re.search(r'[bcdefghijklmnopqrtuvwxyz]i', word):
    word = re.sub(r'i$', r'y', word)
  if re.search(r'[acefghijlmnoqrstuvwxyz]h', word):
    word = re.sub(r'h', '', word)
  word = re.sub(r'k', r'q', word)
  return word

def array_cleaner(array):
  # X = array
  X = []
  for sentence in array:
    clean_sentence = ''
    words = sentence.split(' ')
    for word in words:
      clean_sentence = clean_sentence +' '+ cleaner(word)
    X.append(clean_sentence)
  return X

In [13]:
X_test = numpy_array_test[:,1]
X_test

array(['Mulazmat ke bahali ke dua farma dein aur koe wzeefa bhee bata dein',
       'Dua farma dain meri sehat k luay aur meray baal girna band ho jaye 1 saal say be inteha gir rahay hain',
       'Tum khabees nahi kutti aurat ho 😂😂😂😈😈', ...,
       'Mullah Umar Ne Afghan Hukomat amp Taliban Muzakrat Ki Himayat Kar Di Afghanistan Se Qabzay K Khatmay K Liye Muzakrat Jaiz Hen Paigham ',
       'Embroidery ki puri ek side pe dhagay nikle hue, fabric is average.',
       'tu marti bht h'], dtype=object)

In [14]:

#test if there are nan 
counter = 1
for sentence in X_test:
    try:
        words = sentence.split(' ')
        counter+=1
    except:
        print(sentence)
        print(counter)

In [15]:

X_train = numpy_array[:, 1]
# Clean X here
X_train = array_cleaner(X_train)
X_test = array_cleaner(X_test)
y_train = numpy_array[:, 2]
X_train[:5]

[' asif momin hakir nahi fakir hai mera sabr us ny sameta hai',
 ' phely jaa kr naha ky ao phr mazi ko khechna',
 ' ye to bilkul thk kaha aur hamra km hi dushmano ko jalana h',
 ' dukh hi dukh zindhge mein',
 ' or ya assa he hotta hajassya khamosh samander key lahryain jb utthyee hain to pir sub kuch bahaa kr lya jathyee hainso be care full']

In [16]:

print(len(X_train))
print(len(X_test))

11299
2712


In [17]:
y_train = np.array(y_train)
y_train = y_train.astype('int8')
y_train[:6]

array([0, 0, 0, 0, 0, 0], dtype=int8)

In [18]:
ngram = 2
vectorizer = TfidfVectorizer(sublinear_tf=True,ngram_range=(1, ngram), max_df=0.5)

In [19]:
X_all = X_train + X_test # Combine both to fit the TFIDF vectorization.
lentrain = len(X_train)

vectorizer.fit(X_all) # This is the slow part!
X_all = vectorizer.transform(X_all)

In [20]:
vectorizer.get_feature_names()[-5:]

['鄭h isnan', '鄭pwa', '鄭pwa yani', '鄭pwayani', '鄭pwayani aal']

In [21]:
X_all.shape

(14011, 135645)

In [22]:
X_train_chuli = X_all[:lentrain] # Separate back into training and test sets. 
X_test_chuli = X_all[lentrain:]
X_train_chuli.shape

(11299, 135645)

In [24]:
from sklearn.model_selection import KFold,StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.linear_model import SGDClassifier as SGD

In [25]:
folds = StratifiedKFold(n_splits=10, shuffle=False, random_state=2019)
oof = np.zeros(X_train_chuli.shape[0])
predictions = np.zeros(X_test_chuli.shape[0])

In [26]:
for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train_chuli, y_train)):
    print("Fold :{}".format(fold_ + 1))
    trn_data = X_train_chuli[trn_idx]
    trn_label= y_train[trn_idx]
    val_data = X_train_chuli[val_idx]
    val_label= y_train[val_idx]
    model_SGD = SGD(alpha=0.00001,random_state = 2, shuffle = True, loss = 'log')                      
    model_SGD.fit(trn_data, trn_label) # Fit the model.
    print("auc score: {:<8.5f}".format(metrics.roc_auc_score(val_label, model_SGD.predict_proba(val_data)[:,1])))
    predictions += model_SGD.predict_proba(X_test_chuli)[:,1] / folds.n_splits

Fold :1
auc score: 0.72135 
Fold :2
auc score: 0.79110 
Fold :3
auc score: 0.77758 
Fold :4
auc score: 0.72710 
Fold :5
auc score: 0.76438 
Fold :6
auc score: 0.87402 
Fold :7
auc score: 0.83382 
Fold :8
auc score: 0.88491 
Fold :9
auc score: 0.91795 
Fold :10
auc score: 0.81235 


In [27]:
print(len(predictions))
predictions[:4]

2712


array([0.96284044, 0.82341647, 0.03938631, 0.96956899])

In [28]:
SGD_output = pd.DataFrame({"ID":df_test["ID"], "Pred":predictions})
SGD_output.to_csv('SGD_new.csv', index = False)