In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
!pip install seaborn

Defaulting to user installation because normal site-packages is not writeable
Collecting seaborn


In [4]:
#Load first project
import random
import spacy
import re
import seaborn as sns

# Small English spaCy pipeline, shared by every cell that tokenizes text.
nlp = spacy.load("en_core_web_sm")

# spaCy's default English stop-word collection.
STOPWORDS = nlp.Defaults.stop_words

train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

# Train rows first, test rows after them. Each frame keeps its own row labels,
# so labels repeat across the two parts; slice by position (iloc), not by label.
df = pd.concat([train_data, test_data])

len_train = len(train_data)    # number of rows that came from train.csv
TRAIN_SIZE = len(df)           # NOTE: despite the name, this is the combined train + test row count
SEN = df.text.iloc[0]          # text of the first row

# Only the last expression of a cell is displayed, so the preview goes last.
df.head()

2022-01-21 13:53:04.354771: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-01-21 13:53:04.354804: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


ModuleNotFoundError: No module named 'seaborn'

In [None]:
# How many rows carry each label value.
df['target'].value_counts()

In [None]:
# Number of missing values per column.
df.isna().sum()

In [None]:
# Fill missing values with the " Unknown" placeholder in every column except
# the label: writing a string into `target` would turn it into a mixed-type
# column, and only the rows taken from train_data are used as labels later on.
df = df.fillna({col: " Unknown" for col in df.columns if col != "target"})

In [None]:
# Distinct values of the keyword column (placeholder included once it has been filled in).
uniq_keyword = df['keyword'].unique()
print(uniq_keyword)

In [None]:
# Check the influence by keyword field

# One small histogram of the label per keyword, using only the first
# len_train rows of df (the rows that came from train_data).
keyword_target = df[['keyword', 'target']].iloc[:len_train]
g = sns.FacetGrid(data=keyword_target, col='keyword', col_wrap=6, height=2.5)
# NOTE(review): distplot is deprecated in recent seaborn releases - confirm the installed version still ships it.
g.map(sns.distplot, 'target', kde=False)

### Conclusion: some keywords strongly influence the target, while others do not

## My plan

* Preprocess the DataFrame (clean, tokenize, build the vocabulary, vectorize)
* Choose a model
* Fit, predict, get score 

In [None]:
#OneHotEncoding

# one_hot_df = pd.get_dummies(df.keyword, prefix='keyword')

In [None]:
# Step 1: prepare the text

In [None]:
def get_random_sen(data, num=1):
    
    """
    Pick one random sentence from a series of texts
    
    data: pandas series with text
    num: kept for compatibility; any value >= 1 yields exactly one sentence,
         a value below 1 yields None
    
    Return: str, or None when num < 1
    Raise: IndexError when data is empty
    """
    
    if num < 1:
        return None
    
    size = len(data)
    
    if size == 0:
        raise IndexError("cannot pick a sentence from empty data")
    
    # randrange excludes its upper bound, so the position is always valid for
    # iloc (randint(0, size) could return size itself and run off the end)
    return str(data.iloc[random.randrange(size)])
        

In [None]:
def create_remove_list(data, punkt=True, at_name=True, hashtag=False, mail=True, stopwords=True, url=True, remove_list=None):
    
    """
    Collect the token strings that should be removed from sentences
    
    The text is processed by the module-level spaCy pipeline `nlp` (with the
    "ner" and "parser" components disabled) and a Matcher gathers the text of
    every token that matches one of the enabled patterns.
    
    data: str, text to scan
    punkt: collect punctuation tokens
    at_name: collect tokens starting with "@"
    hashtag: collect tokens starting with "#"
    mail: collect tokens spaCy flags as e-mail-like
    stopwords: add every entry of the module-level STOPWORDS
    url: collect tokens starting with "http"
    remove_list: optional iterable of strings to start from (it is copied, never modified)
    
    Tokens starting with a space, containing a newline or containing a digit
    are always collected.
    
    Return set of str: strings to remove
    """
    
    from spacy.matcher import Matcher
    
    matcher = Matcher(nlp.vocab)
    doc = nlp(data, disable=["ner", "parser"])
    
    # Work on a private copy so the caller's collection is left untouched
    removals = set(remove_list) if remove_list is not None else set()
    
    if stopwords:
        removals.update(STOPWORDS)
    
    patterns = []
    
    if punkt:
        patterns.append([{"IS_PUNCT": True}])            #remove punct token
    
    if at_name:
        patterns.append([{"TEXT": {"REGEX": "^@"}}])     #remove at_name
        
    if hashtag:
        patterns.append([{"TEXT": {"REGEX": "^#"}}])     #remove hashtag
    
    if mail:
        patterns.append([{"LIKE_EMAIL": True}])          #remove e-mail addresses
    
    if url:
        patterns.append([{"TEXT": {"REGEX": "^http"}}])  #remove urls
    
    patterns.append([{"TEXT": {"REGEX": "^ "}}])         #remove spaces
    patterns.append([{"TEXT": {"REGEX": "\n"}}])         #remove /n
    patterns.append([{"TEXT": {"REGEX": r"\d+"}}])       #remove digits (raw string: \d is not a valid str escape)
        
    matcher.add("Removings", patterns)
    
    # Each match is (match_id, start, end); only the matched text is needed
    for _, start, end in matcher(doc):
        removals.add(doc[start:end].text)
    
    return removals
    

In [None]:
def remove_and_tokenize(sen, remove_list):    
    
    """
    Tokenize a sentence and return the lowercased lemmas of the tokens to keep
    
    sen: str, sentence to process with the module-level spaCy pipeline
    remove_list: container of strings; a token is skipped when its text,
                 as written or lowercased, is found in it
    """
    
    kept = []
    
    for token in nlp(sen):
        raw = token.text
        
        if raw.lower() in remove_list or raw in remove_list:
            continue
        
        kept.append(token.lemma_.lower())
    
    return kept
 
    

In [None]:
# The corpus is scanned in two halves because spaCy refuses a single text this
# long: [E088] Text of length 1113253 exceeds maximum of 1000000

half = len(df['text']) // 2

# Every sentence is followed by one space, the last one included.
text1 = ''.join(sen + ' ' for sen in df['text'].iloc[:half])
text2 = ''.join(sen + ' ' for sen in df['text'].iloc[half:])

# Scan the first half, then extend that result with the second half.
remove_list1 = create_remove_list(text1)

remove_list = create_remove_list(text2, remove_list=remove_list1)



In [None]:
%%time
#test result
# Show one random tweet followed by its cleaned token list.
sen = get_random_sen(df["text"])
print(sen)
remove_and_tokenize(sen, remove_list)


In [None]:
%%time

# Tokenized version of every sentence; remove_list is passed as the extra positional argument.
df['tokens'] = df['text'].apply(remove_and_tokenize, args=(remove_list,))

In [None]:
def to_string(row):
    """
    Glue the tokens of one row into a single space-separated string
    
    row: iterable of str
    """
    
    separator = ' '
    return separator.join(row)

In [None]:
# Append the keyword to the space-joined tokens. The explicit ' ' separator
# keeps the keyword a word of its own instead of fusing it onto the last token.
df['result_text'] = df['tokens'].apply(to_string) + ' ' + df['keyword']

In [None]:
#train_test_split
from sklearn.model_selection import train_test_split

# Only the first len(train_data) rows of df take part in the split;
# a quarter of them is held out for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    df['result_text'].iloc[:len(train_data)],
    df['target'].iloc[:len(train_data)].astype('int'),
    test_size=0.25,
    random_state=42,
    shuffle=True,
)

In [None]:
# Sizes of the four parts of the split, in the order they were returned.
print(*map(len, (X_train, X_test, y_train, y_test)))

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn import metrics

# TF-IDF features followed by a multinomial Naive Bayes classifier.
text_clf_nb = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', MultinomialNB()),
])

# TF-IDF features followed by a linear support vector classifier.
text_clf_lsvc = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('clf', LinearSVC()),
])

# Fit both pipelines on the training part of the split.
text_clf_nb.fit(X_train, y_train)
text_clf_lsvc.fit(X_train, y_train)

In [None]:
# Predict on the held-out part with each pipeline (1 = Naive Bayes, 2 = linear SVC).
predictions1 = text_clf_nb.predict(X_test)
predictions2 = text_clf_lsvc.predict(X_test)

# Share of held-out rows each pipeline labelled correctly.
accuracy1 = metrics.accuracy_score(y_true=y_test, y_pred=predictions1)
accuracy2 = metrics.accuracy_score(y_true=y_test, y_pred=predictions2)

print(accuracy1, accuracy2)

In [None]:
# F1 score of each pipeline on the held-out part, in the same order as the predictions.
f_score1, f_score2 = (
    metrics.f1_score(y_test, preds) for preds in (predictions1, predictions2)
)

print(f_score1, f_score2)

In [None]:
def create_submission(model, id_series, X_test, path='./my_submission.csv'):
    
    """
    Predict on X_test and write the ids with their predictions to a CSV file
    
    model: fitted estimator with a predict() method
    id_series: pandas series of row ids, one per row of X_test
    X_test: features handed to model.predict
    path: where the CSV is written (default './my_submission.csv')
    
    Return: the result of DataFrame.to_csv (None when a path is given)
    Raise: ValueError when the number of predictions differs from len(id_series)
    """
    
    predictions = model.predict(X_test)
    
    # Attach the predictions by position. Joining two series on their index
    # silently dropped rows whenever id_series was not labelled 0..n-1.
    result_dataframe = id_series.to_frame()
    result_dataframe['target'] = list(predictions)
    
    return result_dataframe.to_csv(path, index=False)
    

In [None]:
# Rows of df that come after the first len(train_data) ones: the rows to predict for the submission.
test = df['result_text'].iloc[len(train_data):]

# Put the two parts of the hold-out split back together.
all_train = pd.concat([X_train, X_test])
all_test = pd.concat([y_train, y_test])    # labels matching all_train, despite the name

# Refit the Naive Bayes pipeline on every labelled row.
model = text_clf_nb.fit(X=all_train, y=all_test)

In [None]:
# Write the submission file for the rows that came from test.csv.
create_submission(model=model, id_series=test_data["id"], X_test=test)