In [None]:
import numpy as np
import pandas as pd


In [None]:
!pip install --user nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Load the training messages (columns: id, message).
df = pd.read_csv(filepath_or_buffer='train_data.csv')

In [None]:
# Load the labels for the training messages; joined to `df` on `id`.
solution = pd.read_csv(filepath_or_buffer='train_solution.csv')

In [None]:
# Join each training message with its label via the shared `id` column.
df = pd.merge(df, solution, on='id')

In [None]:
# Preview the first rows of the merged training frame (id, message, category).
df.head()

Unnamed: 0,id,message,category
0,271828,Over $616 million in Bitcoin was electrocated ...,1
1,271829,Quiz: Thursday or friday?,0
2,271830,The Australian Revenue Authority will start co...,1
3,271831,Let's continue😉. I present to you my new review,2
4,271832,Here comes your future palette.,2


In [None]:
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK stopwords corpus is available locally before loading it.
nltk.download('stopwords')
# English stopword list; `lemmatize` filters tokens against it.
stop = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
from string import punctuation

# Display the punctuation characters that `lemmatize` filters tokens against.
punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [None]:
import re
# Match runs of ASCII letters only. Note that the tempting shorthand `[A-z]`
# is wrong: it also covers the six characters that sit between 'Z' and 'a' in
# ASCII ([ \ ] ^ _ `), so tokens such as "foo_bar" or "[x]" would count as words.
regex = re.compile("[A-Za-z]+")

def words_only(text, regex=regex):
    """Extract the alphabetic tokens from ``text``.

    Args:
        text: The string to tokenize.
        regex: Compiled pattern whose matches are the tokens; defaults to
            runs of ASCII letters.

    Returns:
        A list of matched substrings in order of appearance, or an empty
        list when ``text`` is not a string (e.g. a float NaN coming from a
        missing CSV cell, or ``None``).
    """
    try:
        return regex.findall(text)
    except TypeError:
        # Non-string input: treat it as a message with no words instead of
        # failing the whole batch. Other errors are allowed to surface.
        return []

In [None]:
def lemmatize(text):
    """Lower-case tokens and drop stopwords and punctuation.

    Despite the name, no lemmatization is performed: tokens are only
    lower-cased and filtered against ``stop`` and ``punctuation``.

    Args:
        text: Iterable of string tokens (e.g. the output of ``words_only``).

    Returns:
        The kept, lower-cased tokens joined by single spaces, or ``" "``
        when ``text`` is not an iterable of strings.
    """
    try:
        # Lower-case each token once instead of three times per token.
        lowered = (word.lower() for word in text)
        # `punctuation` is a plain string, so `in` is a substring test here.
        return " ".join(
            token for token in lowered
            if token not in stop and token not in punctuation
        )
    except (TypeError, AttributeError):
        # Only malformed input (non-iterable, or non-string elements) falls
        # back to the placeholder; a bare `except` would also hide
        # KeyboardInterrupt and a missing `stop`/`punctuation` definition.
        return " "

In [None]:
def clean_text(text):
    """Turn a raw message into a normalized, space-separated token string.

    Tokenizes ``text`` with ``words_only`` and passes the tokens through
    ``lemmatize``.
    """
    tokens = words_only(text)
    return lemmatize(tokens)

In [None]:
from multiprocessing import Pool
from tqdm import tqdm

# Clean every training message using a pool of 8 worker processes, with a
# tqdm progress bar sized to the training frame.
with Pool(processes=8) as pool:
    cleaned_messages = pool.imap(clean_text, df['message'])
    lemmas = list(tqdm(cleaned_messages, total=len(df)))

100%|██████████| 3844/3844 [00:01<00:00, 3523.62it/s]


In [None]:
# Attach the cleaned text as a new `lemmas` column and preview the result.
df = df.assign(lemmas=lemmas)
df.head()

Unnamed: 0,id,message,category,lemmas
0,271828,Over $616 million in Bitcoin was electrocated ...,1,million bitcoin electrocated september wrapped...
1,271829,Quiz: Thursday or friday?,0,quiz thursday friday
2,271830,The Australian Revenue Authority will start co...,1,australian revenue authority start collecting ...
3,271831,Let's continue😉. I present to you my new review,2,let continue present new review
4,271832,Here comes your future palette.,2,comes future palette


In [None]:
# The identifier carries no signal for the classifier; remove it.
df = df.drop(columns=['id'])

In [None]:
# The raw text is superseded by `lemmas`; keep only category and lemmas.
df = df.drop(columns=['message'])
df.head()

Unnamed: 0,category,lemmas
0,1,million bitcoin electrocated september wrapped...
1,0,quiz thursday friday
2,1,australian revenue authority start collecting ...
3,2,let continue present new review
4,2,comes future palette


In [None]:
from sklearn.model_selection import train_test_split

# Features are the cleaned texts, targets are the category labels.
X = np.array(df.lemmas.tolist())
y = np.array(df.category.tolist())

# Hold out 33% of the rows for evaluation. The fixed seed makes the split, and
# therefore every score computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42
)
print ("total train examples %s" % len(y_train))
print ("total test examples %s" % len(y_test))

total train examples 2575
total test examples 1269


In [None]:
def _write_fasttext_file(path, texts, labels):
    """Write paired ``texts``/``labels`` to ``path`` in fastText supervised format.

    Each output line has the form ``__label__<label> <text>``.

    Args:
        path: Destination file; overwritten if it already exists.
        texts: Sequence of cleaned message strings.
        labels: Sequence of category labels, aligned with ``texts``.
    """
    # Write-only mode is sufficient, and the encoding is pinned so the file
    # does not depend on the platform default.
    with open(path, 'w', encoding='utf-8') as outfile:
        for label, text in zip(labels, texts):
            outfile.write('__label__' + str(label) + ' ' + text + '\n')


_write_fasttext_file('train.txt', X_train, y_train)
_write_fasttext_file('test.txt', X_test, y_test)

In [None]:
! git clone https://github.com/facebookresearch/fastText.git
! pip3 install fastText/.

Cloning into 'fastText'...
remote: Enumerating objects: 3930, done.[K
remote: Counting objects: 100% (943/943), done.[K
remote: Compressing objects: 100% (137/137), done.[K
remote: Total 3930 (delta 854), reused 806 (delta 806), pack-reused 2987[K
Receiving objects: 100% (3930/3930), 8.24 MiB | 19.31 MiB/s, done.
Resolving deltas: 100% (2505/2505), done.
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Processing ./fastText
[33m  DEPRECATION: A future pip version will change local packages to be built in-place without first copying to a temporary directory. We recommend you use --use-feature=in-tree-build to test your packages with this new behavior before it becomes the default.
   pip 21.3 will remove support for this functionality. You can find discussion regarding this at https://github.com/pypa/pip/issues/7555.[0m
Collecting pybind11>=2.2
  Using cached pybind11-2.10.0-py3-none-any.whl (213 kB)
Building wheels for collected p

In [None]:
import fasttext

In [None]:
# Train the supervised fastText model on train.txt: 12 epochs, 'ova' loss.
classifier = fasttext.train_supervised(
    input='train.txt',
    epoch=12,
    loss='ova',
)

In [None]:
# Evaluate the model on the held-out file written earlier. The first element
# of the returned tuple matches the number of test examples (1269).
# NOTE(review): the other two values are presumably precision and recall at
# k=1, per the fastText docs; confirm.
classifier.test('test.txt')

(1269, 0.8699763593380615, 0.8699763593380615)

In [None]:
# Load the unlabeled messages to predict (columns: id, message).
df_res = pd.read_csv(filepath_or_buffer='test_data.csv')

In [None]:
from multiprocessing import Pool
from tqdm import tqdm

# Clean the test messages in parallel. The progress total has to come from the
# frame being iterated (df_res): sizing it from the training frame gives tqdm
# a total smaller than the real item count, so it degrades to a bare counter.
with Pool(8) as p:
    lemmas = list(tqdm(p.imap(clean_text, df_res['message']), total=len(df_res)))

5927it [00:00, 6395.40it/s]


In [None]:
# Attach the cleaned text to the test frame and preview the result.
df_res = df_res.assign(lemmas=lemmas)
df_res.head()

Unnamed: 0,id,message,lemmas
0,275672,But a lot of people have a job fair tonight.,lot people job fair tonight
1,275673,"Also, I got only 4 answers on the google form ...",also got answers google form
2,275674,"Vladimir, when will we have seminar?",vladimir seminar
3,275675,"A couple at 111, too?",couple
4,275676,"It's on Anti-buying. And again, Zara:",anti buying zara


In [None]:
# Only the cleaned text is needed for prediction; drop id and raw message
# in a single call.
df_res = df_res.drop(columns=['id', 'message'])
df_res.head()

Unnamed: 0,lemmas
0,lot people job fair tonight
1,also got answers google form
2,vladimir seminar
3,couple
4,anti buying zara


In [None]:
# Dump the cleaned test texts, one per line. Iterating the column directly
# avoids building a row Series for every `.iloc[i]` lookup; write-only mode
# and a pinned encoding keep the file independent of platform defaults.
with open('test2.txt', 'w', encoding='utf-8') as outfile:
    for text in df_res['lemmas']:
        outfile.write(str(text) + '\n')

In [None]:
# Start from an empty collection of predicted labels.
result = list()

In [None]:
# Predict the top label for every cleaned test message. The list is rebuilt
# from scratch on each run, so re-executing this cell cannot append a second
# copy of the predictions.
label_prefix_len = len('__label__')  # predicted labels look like '__label__2'
result = [
    classifier.predict(text)[0][0][label_prefix_len:]
    for text in tqdm(df_res['lemmas'], total=len(df_res))
]

100%|██████████| 5927/5927 [00:00<00:00, 7334.62it/s]


In [None]:
# Spot-check the first five predicted labels (strings with the '__label__'
# prefix already stripped).
result[:5]

['2', '0', '0', '0', '0']

In [None]:
# Load the submission template (columns: id, category).
sub = pd.read_csv(filepath_or_buffer='sample_submission.csv')

In [None]:
# Fill the template's category column with the model's predictions.
sub = sub.assign(category=result)

In [None]:
# Persist the predictions without the DataFrame index. Note that the target
# is the same file the submission template was read from.
sub.to_csv(path_or_buf='sample_submission.csv', index=False)

In [None]:
# Preview the submission frame with the predicted categories filled in.
sub.head()

Unnamed: 0,id,category
0,275672,2
1,275673,0
2,275674,0
3,275675,0
4,275676,0
