<a href="https://colab.research.google.com/github/fatemesima/NLP/blob/main/persian_comments_predict_org.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd

# Load the train and test splits; both CSV files are read via relative paths,
# so they must sit in the notebook's working directory.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')


In [None]:
# getting information about train dataset
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 149400 entries, 0 to 149399
Data columns (total 2 columns):
 #   Column                 Non-Null Count   Dtype 
---  ------                 --------------   ----- 
 0   body                   149400 non-null  object
 1   recommendation_status  149400 non-null  object
dtypes: object(2)
memory usage: 2.3+ MB


In [None]:
# getting information about test dataset
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 2 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   body                   600 non-null    object
 1   recommendation_status  600 non-null    object
dtypes: object(2)
memory usage: 9.5+ KB


In [None]:
# Count the number of data classes in the target column
train_data['recommendation_status'].value_counts()

Unnamed: 0_level_0,count
recommendation_status,Unnamed: 1_level_1
not_recommended,49800
recommended,49800
no_idea,49800


In [None]:
# Encode the three target classes as integers:
# "not_recommended" -> 0, "recommended" -> 1, "no_idea" -> 2.
# NOTE: the column is overwritten in place and Series.map yields NaN for values that are
# missing from the dict, so running this cell a second time would blank out the labels.
train_data["recommendation_status"] = train_data["recommendation_status"].map({"no_idea": 2,"recommended": 1, "not_recommended": 0})

In [None]:
# Check the values stored in "recommendation_status" after the integer encoding
train_data["recommendation_status"].unique()

array([0, 1, 2])

In [None]:
train_data["recommendation_status"].value_counts()

Unnamed: 0_level_0,count
recommendation_status,Unnamed: 1_level_1
0,49800
1,49800
2,49800


In [None]:
!pip install hazm



In [None]:
from hazm import Normalizer, word_tokenize, Stemmer, stopwords_list
import re

# Shared hazm helpers and patterns used by the preprocessing functions.
# Stop words are held in a set so each membership test is a hash lookup rather
# than a scan over the whole collection returned by stopwords_list().
stopwords = set(stopwords_list())
normalizer = Normalizer()
stemmer = Stemmer()
# Raw string: the previous non-raw literal contained the invalid escape "\,", which
# newer Python versions warn about. The characters themselves (backslash included)
# are unchanged. Note that the space character is part of this set.
punctuations = r'''!()-[]{};:'"\,؟ <>./?@#$%^&*_~'''
persian_numbers_regex = r"[۰-۹]+"
latin_numbers_regex = r"[\d]+"
white_space = r" +"

def remove_punctuations(token):
    """Return `token` with every character listed in `punctuations` replaced by a space."""
    return "".join(" " if ch in punctuations else ch for ch in token)

def remove_stop_words(words):
    """Return a new list holding the items of `words` that are not in `stopwords`, in order."""
    return [candidate for candidate in words if candidate not in stopwords]

def preprocess_text(text):
    """Turn a raw comment into a list of cleaned, stemmed tokens.

    Steps, in order: normalise with hazm, delete Persian and Latin digits,
    collapse runs of spaces, tokenise, stem, drop stop words, and strip
    punctuation.

    Args:
        text: The raw comment. Any value is accepted; it is coerced with ``str``.

    Returns:
        list[str]: Non-empty tokens that contain no whitespace and no character
        from ``punctuations``.
    """
    text = str(text)
    text = normalizer.normalize(text)
    text = re.sub(persian_numbers_regex, '', text)
    text = re.sub(latin_numbers_regex, '', text)
    # Deleting digits can leave doubled spaces behind; collapse them once, then trim.
    text = re.sub(white_space, ' ', text).strip()
    tokens = word_tokenize(text)
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    # NOTE(review): stop words are matched against the stemmed forms, so a stop word
    # whose stem is not itself in the list survives — confirm this order is intended.
    filtered_tokens = remove_stop_words(stemmed_tokens)
    final_tokens = []
    for token in filtered_tokens:
        # remove_punctuations() swaps punctuation for spaces, so a token may come back
        # blank, space-padded, or as several space-separated pieces. str.split() drops
        # the blanks and separates the pieces, so only clean tokens are returned.
        final_tokens.extend(remove_punctuations(token).split())
    return final_tokens



In [None]:
exmpale = "من متولد سال ۱۳۷۷ هستم"

In [None]:
preprocess_text(exmpale)

['متولد', 'سال', 'هس']

In [None]:
dataes = train_data['body']

In [None]:
data_processed = dataes.apply(preprocess_text)

In [None]:
train_data["preprocess"] = data_processed
train_data.head()

Unnamed: 0,body,recommendation_status,preprocess
0,جنسش‌خوب‌بود‌خیلی‌بدبدبود,0,[جنسش‌خوب‌بود‌خیلی‌بدبدبود]
1,به کار میاد شک ندارم,1,"[کار, میاد, شک, ندار]"
2,چیزی ک توعکسه واست میفرستن ولی هم جنسش خوب نیس...,2,"[ک, توعکسه, واس, میفرستن, ول, ه, جنس, نیس, ه, ..."
3,رنگش خیلی خوبه . براق هم هست و زود خشک میشه . ...,2,"[رنگ, خیل, خوبه, , براق, ه, هس, زود, خشک, میش..."
4,من مرجوع کردم قسمت پاچه شلوار برام تنگ بود ولی...,2,"[مرجوع, قسم, پاچه, شلوار, برا, تنگ, ول, جنس, ب..."


In [None]:
from gensim.models import Word2Vec
# Train 100-dimensional word embeddings on the tokenised training comments
# (context window of 5 tokens). min_count=1 keeps every token in the vocabulary,
# however rare it is.
# NOTE(review): training runs on 4 worker threads, so the learned vectors may differ
# slightly between runs — confirm against the gensim reproducibility notes.
model = Word2Vec(sentences=train_data["preprocess"], vector_size=100, window=5, min_count=1, workers=4)

In [None]:
model.wv.most_similar("دوست")

[('دوستشون', 0.829220175743103),
 ('دوسشون', 0.8061068058013916),
 ('بخشیدم', 0.7751042246818542),
 ('دوس', 0.7571793794631958),
 ('ومشکل', 0.751501739025116),
 ('نیدار', 0.7435619831085205),
 ('stranger', 0.7405935525894165),
 ('عاشقشه', 0.7400643825531006),
 ('رضایتبخ', 0.7379044890403748),
 ('ورایحه', 0.7376339435577393)]

In [None]:
import numpy as np


In [None]:
# Create sentence vectors by averaging word vectors
def sentence_vector(sentence):
    """Average the Word2Vec vectors of the tokens in `sentence`.

    Args:
        sentence: Iterable of token strings.

    Returns:
        numpy.ndarray: 1-D vector whose length equals the embedding size of the
        trained ``model``. Tokens missing from the vocabulary contribute a zero
        vector; an empty `sentence` yields an all-zero vector.
    """
    # Read the dimensionality from the trained model instead of repeating the
    # literal 100, so changing vector_size at training time cannot cause a mismatch.
    dim = model.wv.vector_size
    vectors = []
    for word in sentence:
        try:
            vectors.append(model.wv[word])
        except KeyError:
            # Word not in the vocabulary: count it as a zero vector in the average.
            vectors.append(np.zeros(dim))
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(dim)


In [None]:
sentence_vectors = train_data['preprocess'].apply(sentence_vector)
sentence_vectors

Unnamed: 0,preprocess
0,"[0.005694287, 0.008681965, 0.003570844, 0.0045..."
1,"[-0.44634414, 0.5430974, -0.16870365, -0.62422..."
2,"[-0.93686, 0.31054696, 0.38784483, -0.13335012..."
3,"[-0.88345385, 0.4296946, 0.7630957, -0.3176209..."
4,"[-1.4882497, 0.2310642, -0.033399116, -0.45384..."
...,...
149395,"[-0.2964283, -0.019203315, 0.3061406, -0.09028..."
149396,"[-0.7486878, 0.1474654, 0.10103553, -0.2724710..."
149397,"[-0.6200192, 0.25292456, -0.10406545, -0.07488..."
149398,"[-0.77534854, 0.4534001, 0.3839673, -0.2479527..."


In [None]:
from sklearn.model_selection import train_test_split

# Stack the per-comment sentence vectors into one NumPy array (one row per comment)
X = np.array(sentence_vectors.to_list())

# Target labels: the integer-encoded train_data["recommendation_status"] column
y = train_data["recommendation_status"].values

# Hold out 20% of the rows for evaluation; random_state=42 makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:

from sklearn.linear_model import LogisticRegression

# Initialize and train the Logistic Regression model on the training split.
# max_iter=1000 sets the solver's iteration cap (presumably raised to avoid
# convergence warnings — TODO confirm).
logistic_model = LogisticRegression(max_iter=1000)
logistic_model.fit(X_train, y_train)


In [None]:
from sklearn.metrics import accuracy_score

# Make predictions on the held-out 20% split of train.csv
# (X_test comes from train_test_split; it is not the test.csv data held in test_data)
y_pred = logistic_model.predict(X_test)

# Evaluate the model: fraction of held-out comments whose predicted class matches the label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

Accuracy: 0.6597389558232932


In [None]:
def predict_recommendation(comment):
    """Classify one raw comment and return its class name.

    The comment is preprocessed, turned into a sentence vector, and passed to
    the fitted logistic model as a single-row batch.

    Returns:
        str: "no_idea" for class 2, "recommended" for class 1, and
        "not_recommended" for anything else.
    """
    tokens = preprocess_text(comment)
    features = np.array([sentence_vector(tokens)])
    predicted_class = logistic_model.predict(features)[0]
    class_names = {2: "no_idea", 1: "recommended"}
    return class_names.get(predicted_class, "not_recommended")

In [None]:
new_comment = 'عجیب به نظر میاد'

In [None]:
predict_recommendation(new_comment)

'no_idea'

<p dir=rtl style="direction: rtl;text-align: justify;line-height:200%;font-family:vazir;font-size:medium">
  <font face="vazir" size=3>
    انجام عملیات پیش‌بینی نظرهای فایل test. در این مرحله تابع پیاده‌سازی شده را برای داده‌های داخل فایل تست اعمال کنید و نتیجه را در یک دیتافریم به نام submission ذخیره کنید. این دیتافریم می‌بایست به شکل زیر باشد و مشخص کند که کلاس هرکدام از نظرها چه می‌باشد. (این فایل به صورت مستقیم در ارزیابی پروژه شما تاثیر دارد)
  </font>
</p>
<table class="center">
  <tr>
    <th>class</th>
  </tr>
  <tr>
    <td>not_recommended</td>
  </tr>
  <tr>
    <td>not_recommended</td>
  </tr>
  <tr>
    <td>recommended</td>
  </tr>
  <tr>
    <td>...</td>
  </tr>
</table>

In [None]:
# Predict a class name for every comment in test.csv and collect the results in
# the single-column ("class") DataFrame that the task asks for.
pre = test_data['body'].apply(predict_recommendation)
submission = pd.DataFrame({'class':pre})
submission

Unnamed: 0,class
0,recommended
1,not_recommended
2,no_idea
3,recommended
4,recommended
...,...
595,recommended
596,recommended
597,not_recommended
598,no_idea


In [None]:
import zipfile
import joblib

def compress(file_names):
    """Pack the given files from the current directory into ``result.zip``.

    Prints the list of file names first. Each file is stored under its own name
    using DEFLATE compression; an existing ``result.zip`` is overwritten.

    Args:
        file_names: Iterable of file names relative to the working directory.
    """
    print("File Paths:")
    print(file_names)
    archive = zipfile.ZipFile("result.zip", mode="w")
    with archive:
        for name in file_names:
            archive.write(
                './' + name,
                arcname=name,
                compress_type=zipfile.ZIP_DEFLATED,
            )

# Write the predictions to CSV without the DataFrame index, leaving only the "class" column
submission.to_csv('submission.csv', index=False)

# Bundle the CSV into result.zip
file_names = [ 'submission.csv']
compress(file_names)

File Paths:
['submission.csv']
