In [None]:
import pandas as pd
from sklearn.metrics import precision_score
!pip install hazm
!pip install parsivar

In [None]:
# Load the training set straight from a pre-signed object-storage URL.
# NOTE(review): the query string carries X-Amz-Date=20240307T082819Z and
# X-Amz-Expires=604800 (seconds), so this link presumably stopped working
# about a week after that date — confirm and switch to a stable data path.
data_train_url = 'https://minio.roboepics.com/datasets/25/data_train.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=competitions%2F20240307%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240307T082819Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=4d16cd7e8b4e20b0d5586c18f6ad5249f8427de7dc4f3fe5facffc581741a4fd'
df = pd.read_csv(data_train_url)

In [None]:
# Load the unlabeled test set from a pre-signed object-storage URL.
# NOTE(review): the query string carries X-Amz-Date=20240307T083616Z and
# X-Amz-Expires=604800 (seconds), so this link presumably stopped working
# about a week after that date — confirm and switch to a stable data path.
data_test_users_url = 'https://minio.roboepics.com/datasets/32/data_test_users.csv?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=competitions%2F20240307%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240307T083616Z&X-Amz-Expires=604800&X-Amz-SignedHeaders=host&X-Amz-Signature=7e2d69bb399818d3de78f6c50ba17f9b4e9c415129711f371c648696a014d4a0'
df_test = pd.read_csv(data_test_users_url)

In [None]:
# Display the (rows, columns) shape of the train and test frames side by side.
df.shape, df_test.shape

((90000, 3), (9000, 2))

In [None]:
# Peek at the first rows of the training frame.
df.head()

In [None]:
# Column dtypes and non-null counts of the training frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 90000 entries, 0 to 89999
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   id       90000 non-null  int64 
 1   comment  90000 non-null  object
 2   intent   90000 non-null  object
dtypes: int64(1), object(2)
memory usage: 2.1+ MB


In [None]:
# Number of distinct ids. In the recorded run this is 89999 against 90000
# rows, i.e. one id occurs twice.
df['id'].nunique()

89999

In [None]:
# Number of rows that are exact duplicates of an earlier row.
# NOTE(review): the recorded run reports 1, and no later visible cell drops
# that duplicate before training.
df.duplicated().sum()

1

In [None]:
# Distinct raw label strings; a row with several labels stores them
# comma-separated (e.g. '1,5' in the recorded output).
df['intent'].unique()

array(['1,5', '1,4', '1,3', '1,2', '5', '4', '3', '2', '1'], dtype=object)

In [None]:
# Inventory of every distinct character that occurs in the comments, to see
# what the text cleaning has to cope with. The join puts a space between
# consecutive comments, so ' ' is always part of the set.
all_comments = ' '.join(df['comment'].astype(str))
unique_characters = set(all_comments)
print(unique_characters)
# Keep the count as the cell's last expression so it is displayed on its own;
# `print(...), len(...)` built a tuple whose first element was print's None.
len(unique_characters)

{'ء', '👏', '🤍', '♂', 'ُ', 'ق', 'e', '_', '#', '🙁', '😡', '¡', 'ح', 'w', ';', '۷', '۹', 'g', '؛', '\n', 'ە', 'د', '🏼', '😁', '١', 'T', '😘', '«', '°', '😩', '🤨', '8', '💓', '🥴', 'k', 'h', 'c', '…', '👌', '😅', '🤦', '💜', 'ۀ', 'ك', '🚶', '😀', '۔', '😥', '🥰', 'z', 'ه', '☺', '🤗', 'ا', '?', '🔋', 'أ', '\\', '👎', '😬', 'L', 'ً', '9', '٤', '✔', 'G', '=', 'x', '😮', 'M', '؟', '0', '\u200d', 'K', 'ک', '(', '😢', '😐', 'p', ':', '👩', '🙏', '💙', '😠', '7', '|', ')', '•', 'ے', '🌹', 'ی', '2', '*', 'j', 'W', 'غ', '٪', '٩', '۱', '😶', '٥', 'س', 'E', '😻', 'ع', '💔', '^', '🍓', '😎', '😞', 'ض', '🌶', 'ڗ', 'B', '⅓', 'ظ', '✂', '😊', '💞', 'V', '🌸', 'ص', 'O', '٧', '،', '♀', '👑', '👂', '🖤', 'ٍ', '😄', 'آ', 'v', 'ھ', '😋', 'N', 'o', '۲', 'ش', 'ف', '✌', 'ٌ', '👊', 'م', '&', '۳', '😂', '💗', '🤞', '!', 'J', '♥', 'f', 'C', '👍', 'm', '♡', '😔', '🏻', '»', 'Y', 'إ', 'r', '🎀', '🍫', '😉', '%', '÷', '🤘', '🤔', 'd', 'u', '💐', '۶', '}', '\u200e', '۰', 'S', '😏', 'y', 'q', '\u200c', 'ل', 'ـ', 'ّ', 'b', 's', 'ي', '☹', 't', '❤', 'D', '٦', 'ہ', 'ژ', '️', '٨

(None, 302)

In [None]:
import re
from hazm import Normalizer

normalizer = Normalizer()

def preprocess_text(text):
    """Normalize a comment and strip it down to whitespace-separated words.

    The text is first passed through the module-level ``normalizer``. Then,
    in order: every character that is neither a word character nor
    whitespace is deleted, digit runs are deleted, whitespace runs collapse
    to a single space, and leading/trailing whitespace is trimmed.
    """
    cleaned = normalizer.normalize(text)
    # (pattern, replacement) pairs, applied one after another; order matters
    # because digits survive the first pattern and are removed by the second.
    substitutions = (
        (r"[^\w\s]", ""),
        (r"\d+", ""),
        (r"\s+", " "),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

# Store the cleaned text in a `new_comment` column on both frames.
for frame in (df, df_test):
    frame['new_comment'] = frame['comment'].apply(preprocess_text)

from parsivar import Normalizer

# Build the normalizer a single time. Constructing it inside the function
# repeated that setup for every row that `.apply` passed through it.
_parsivar_normalizer = Normalizer()

def normalize_text(text):
    """Return ``text`` after a pass through the shared ``Normalizer`` instance.

    Args:
        text: String to normalize.

    Returns:
        Whatever ``Normalizer.normalize`` returns for ``text``.
    """
    return _parsivar_normalizer.normalize(text)

# Run the second normalization pass over the cleaned text of both frames,
# overwriting `new_comment` with the result.
for frame in (df, df_test):
    frame['new_comment'] = frame['new_comment'].apply(normalize_text)

In [None]:
# Data preprocessing to split labels and convert to one-hot encoding
from sklearn.preprocessing import MultiLabelBinarizer
# Split the comma-separated label strings into lists WITHOUT writing them
# back to df['intent']. Overwriting the column made this cell non-re-runnable
# (a second run would call `.str.split` on lists instead of strings) and left
# earlier displays of df out of date.
intent_labels = df['intent'].str.split(',')
mlb = MultiLabelBinarizer()
y_one_hot = pd.DataFrame(mlb.fit_transform(intent_labels), columns=mlb.classes_)

# reset_index() without drop=True adds an 'index' column. It is kept because
# the later feature/label split drops it again by name.
# NOTE(review): since 'index' is unique per row, processed_df.duplicated()
# cannot report any duplicates while this column is present.
y_one_hot = y_one_hot.reset_index()

# Combine X features and y one-hot encoded labels
processed_df = pd.concat([df['new_comment'], y_one_hot], axis=1)

In [None]:
# Count duplicated rows in the combined frame.
# NOTE(review): processed_df still contains the 'index' column produced by
# reset_index(), which differs on every row, so this count is always 0 and
# does not reveal duplicated (comment, labels) pairs.
processed_df.duplicated().sum()

0

In [None]:
# Separate the text feature from the one-hot label columns; the helper
# 'index' column is discarded together with the text column.
X = processed_df['new_comment']
y = processed_df.drop(columns=['new_comment', 'index'])

# Hold out a quarter of the rows for validation, with a fixed seed.
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [None]:
from parsivar import Tokenizer, FindStems
from hazm.utils import stopwords_list

# Shared NLP helpers used by the functions below.
tokenizer = Tokenizer()
# remove_stopwords tests membership once per token; a set makes each lookup
# constant-time instead of scanning the whole stop-word collection.
stopwords = set(stopwords_list())
stemmer = FindStems()

def tokenize(text):
    """Split ``text`` into word tokens with the module-level ``tokenizer``."""
    words = tokenizer.tokenize_words(text)
    return words

def remove_stopwords(tokens):
    """Return, in order, the tokens that are not in the module-level ``stopwords``."""
    kept = []
    for candidate in tokens:
        if candidate in stopwords:
            continue
        kept.append(candidate)
    return kept

def stemming(tokens):
    """Return a list with ``stemmer.convert_to_stem`` applied to every token."""
    return list(map(stemmer.convert_to_stem, tokens))

def preprocess_text(text, use_stemming=False):
    """Tokenize ``text``, drop stop words, optionally stem, and re-join.

    Args:
        text: Comment string to process.
        use_stemming: When true, every remaining token is replaced by its
            stem. Earlier this flag was accepted but ignored and stemming
            always ran.

    Returns:
        The processed tokens joined by single spaces.
    """
    tokens = tokenize(text)
    tokens = remove_stopwords(tokens)
    if use_stemming:
        tokens = stemming(tokens)
    return ' '.join(tokens)

def preprocess_series(text_series, use_stemming=False):
    """Apply ``preprocess_text`` to each element of a pandas Series.

    Args:
        text_series: Series of comment strings.
        use_stemming: Forwarded unchanged to ``preprocess_text``.

    Returns:
        A Series of processed strings, as produced by ``Series.apply``.
    """
    return text_series.apply(lambda x: preprocess_text(x, use_stemming))

# Stemming is requested explicitly: the pipeline has always stemmed (the flag
# used to be ignored), so passing True keeps the produced text unchanged now
# that the flag is honoured.
X_train = preprocess_series(X_train, use_stemming=True)
print('Preprocessing for train done.')

X_val = preprocess_series(X_val, use_stemming=True)
print('Preprocessing for val done.')

X_test = df_test['new_comment']
X_test = preprocess_series(X_test, use_stemming=True)
print('Preprocessing for test done.')

Preprocessing for train done.
Preprocessing for val done.
Preprocessing for test done.


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features limited to 100 terms. The vocabulary is learned from the
# training split only and then reused for the validation and test splits.
tfidf_vectorizer = TfidfVectorizer(max_features=100)
X_train_tf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_val_tf, X_test_tf = (
    tfidf_vectorizer.transform(split).toarray() for split in (X_val, X_test)
)

from gensim.models import Word2Vec
import numpy as np

# Train Word2Vec on the whitespace-tokenized training comments only.
sentences = list(map(str.split, X_train))
w2v_model = Word2Vec(
    sentences,
    window=5,
    min_count=5,
    workers=4,
)

def vectorize(sentence):
    """Embed a sentence as the mean of its in-vocabulary Word2Vec vectors.

    Args:
        sentence: Whitespace-separated string of tokens.

    Returns:
        A 1-D array: the element-wise mean of the vectors of the words found
        in ``w2v_model.wv``, or a zero vector of the model's dimensionality
        when none of the words is in the vocabulary.
    """
    words = sentence.split()
    words_vecs = [w2v_model.wv[word] for word in words if word in w2v_model.wv]
    if not words_vecs:
        # Size the fallback from the trained model instead of hard-coding
        # 100, so it stays aligned if the model's vector size is changed.
        return np.zeros(w2v_model.wv.vector_size)
    return np.array(words_vecs).mean(axis=0)

# Sentence embeddings for each split: one row per comment.
X_train_w2v, X_val_w2v, X_test_w2v = (
    np.array([vectorize(sentence) for sentence in split])
    for split in (X_train, X_val, X_test)
)

In [None]:
# Convert the label frames to plain arrays. np.asarray gives the same matrix
# as `.values` for a DataFrame and leaves an ndarray untouched, so re-running
# this cell no longer fails with AttributeError on the second pass.
y_train = np.asarray(y_train)
y_val = np.asarray(y_val)

In [None]:
#solve a multi-label classification
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

# One-vs-rest SVC trained on the TF-IDF features, evaluated on validation.
svc_tf = SVC()
ovr_tf = OneVsRestClassifier(svc_tf).fit(X_train_tf, y_train)
y_pred_svm_tf = ovr_tf.predict(X_val_tf)

# Precision per label column (average=None returns one score per label).
precision_score(y_true=y_val, y_pred=y_pred_svm_tf, average=None)

array([0.75040669, 0.92961609, 0.89346976, 0.87843137, 0.97602192])

In [None]:
# Micro-averaged precision of the TF-IDF model's validation predictions.
precision_score(y_val, y_pred_svm_tf, average='micro')

0.8459859491167042

In [None]:
# One-vs-rest SVC trained on the Word2Vec sentence embeddings.
svc_w2v = SVC()
ovr_clf = OneVsRestClassifier(svc_w2v).fit(X_train_w2v, y_train)
y_pred_svm_w2v = ovr_clf.predict(X_val_w2v)

# Precision per label column (average=None returns one score per label).
precision_score(y_true=y_val, y_pred=y_pred_svm_w2v, average=None)

array([0.7857464 , 0.93252189, 0.88883576, 0.89504062, 0.96102289])

In [None]:
# Micro-averaged precision of the Word2Vec model's validation predictions.
precision_score(y_val, y_pred_svm_w2v, average='micro')

0.8638812742225163

In [None]:
# Predict the test-set label matrix with ovr_clf, the classifier that was
# fitted on the Word2Vec features.
predictions = ovr_clf.predict(X_test_w2v)

In [None]:
# Turn each row of the 0/1 prediction matrix into the list of 1-based column
# positions that are set.
# NOTE(review): this assumes column k of the matrix corresponds to label
# k+1 — verify against mlb.classes_.
predict = [
    [position for position, flag in enumerate(row, start=1) if flag == 1]
    for row in predictions
]

print(len(predict))

df_test['intent'] = predict

9000


In [None]:
def clean(text):
    """Render a value as a string with square brackets and whitespace removed.

    For a list such as ``[1, 4]`` this yields ``"1,4"``; an empty list
    yields ``""``.
    """
    rendered = str(text)
    without_markup = re.sub(r"[\[\]\s]", "", rendered)
    return without_markup.strip()

# Convert each predicted label list to its comma-separated string form.
df_test['intent'] = df_test['intent'].apply(clean)
# errors='ignore' keeps the cell re-runnable: on a second run the helper
# column is already gone and a plain drop would raise KeyError.
df_test = df_test.drop(columns=['new_comment'], errors='ignore')

In [None]:
# Distinct predicted label strings; '' marks rows where no label was predicted.
df_test['intent'].unique()

array(['1', '', '1,4', '1,2', '5', '4', '2', '1,3', '1,5', '3', '1,2,5',
       '1,2,4', '1,3,4', '3,4', '1,2,3', '1,4,5', '3,5', '1,3,5', '4,5'],
      dtype=object)

In [None]:
# Frequency of each predicted label combination on the test set.
df_test['intent'].value_counts()

1        1667
1,2       987
5         976
1,4       962
1,5       955
4         918
1,3       849
3         793
2         754
           97
1,3,4      11
3,4        11
1,2,5       7
1,4,5       4
1,2,3       3
3,5         3
1,2,4       1
1,3,5       1
4,5         1
Name: intent, dtype: int64

In [None]:
# Rows for which no label was predicted have an empty string; give them the
# label '1' as a fallback.
no_label = df_test['intent'] == ''
df_test['intent'] = df_test['intent'].mask(no_label, '1')

In [None]:
# Inspect the first ten test rows with their final labels.
df_test.head(10)

Unnamed: 0,id,comment,intent
0,15336956,خیلی خوبه عالیه,1
1,15336959,زیبا بود,1
2,15336960,به علت شکیتگی مرجوع کردم,1
3,15336961,هم جعبه ماوس باز شده بود و هم ماوس شکسته بود,14
4,15336964,چراغ قوه اش خوب بود,1
5,15336968,طرحش کاشی نمیشه! ما چند تا خریدیم اما اصلا کنا...,14
6,15336971,بهترین کتابی که تو زندگیم خوندم بود واقعا عالیه,1
7,15336974,جنس پلاستیکش نرمه برای بازی کردن بچه ها خوبه ...,1
8,15336976,خوبه من راضیم,1
9,15336977,فقط جدا نیست باید خودت جداشون کنی,1


In [None]:
# Write the predictions to result.csv in the current working directory,
# without the DataFrame index.
df_test.to_csv('result.csv', index=False)

In [None]:
# Read the file back from the same relative path it was written to. The
# hard-coded absolute path '/content/result.csv' only matched when the
# working directory happened to be /content.
result = pd.read_csv('result.csv')

In [None]:
# Sanity-check the first ten rows of the file that was read back.
result.head(10)

Unnamed: 0,id,comment,intent
0,15336956,خیلی خوبه عالیه,1
1,15336959,زیبا بود,1
2,15336960,به علت شکیتگی مرجوع کردم,1
3,15336961,هم جعبه ماوس باز شده بود و هم ماوس شکسته بود,14
4,15336964,چراغ قوه اش خوب بود,1
5,15336968,طرحش کاشی نمیشه! ما چند تا خریدیم اما اصلا کنا...,14
6,15336971,بهترین کتابی که تو زندگیم خوندم بود واقعا عالیه,1
7,15336974,جنس پلاستیکش نرمه برای بازی کردن بچه ها خوبه ...,1
8,15336976,خوبه من راضیم,1
9,15336977,فقط جدا نیست باید خودت جداشون کنی,1
