In [None]:
import pandas as pd
# Turn off pandas' chained-assignment (SettingWithCopy) warning for the whole notebook.
# NOTE(review): this silences the warning instead of fixing the assignments that
# trigger it — confirm that is still intended.
pd.options.mode.chained_assignment = None  # default='warn'

In [None]:
# Load data/vehicles.csv (path relative to the notebook's working directory) into a DataFrame.
df = pd.read_csv("data/vehicles.csv")

In [None]:
# Display the freshly loaded frame.
df

In [None]:
# Summary statistics (count, mean, std, quartiles, min/max) for the frame.
df.describe()

First, we drop the rows with **price = -1**:

In [None]:
# Keep only the rows whose price is not -1.
# .copy() makes the result an independent frame, so the column assignments done
# in later cells modify `df` itself instead of a slice of the loaded data.
df = df[df["price"] != -1].copy()

In [None]:
# Display the frame after removing the price == -1 rows.
df

In [None]:
# Column dtypes, to see which columns are numeric and which are object (text).
df.dtypes

### Replacing NaN values:

In [None]:
# Count the missing values of every column in one pass, then report them.
nan_counts = df.isna().sum()
for col, n_missing in nan_counts.items():
    print(col, "has {0} NaN values".format(n_missing))

We have NaN values in the following columns:
1. brand
2. mileage
3. year

In [None]:
# Columns that contain missing values.
nan_cols = ["brand", "mileage", "year"]

# Most frequent value(s) of each of those columns.
modes = df[nan_cols].mode()
print(modes)

In [None]:
# Missing-value count of each column in nan_cols, before filling.
remaining = df[nan_cols].isna().sum()
for col in nan_cols:
    print("Column {} has {} NaN values".format(col, remaining[col]))

In [None]:
# Replacement value for the missing entries of each column.
# `mileage` is numeric, so it is filled with a number (not a string) to keep the
# column's dtype numeric; `year` still holds strings at this point and is
# converted to int in a later cell.
fill_values = {
    "brand": "پراید صندوق‌دار::Pride",
    "mileage": 200000.0,
    "year": "1393",
}

# Fill on the frame and assign the result back: calling
# fillna(..., inplace=True) on a selected column only modifies a temporary
# Series and is not guaranteed to reach `df`.
df = df.fillna(value=fill_values)

In [None]:
# Re-check the same columns after filling.
for col, n_missing in df[nan_cols].isna().sum().items():
    print("Column {} has {} NaN values".format(col, n_missing))


Now we convert the categorical values:
1. brand
2. category
3. created_at
4. description
5. title
6. year

to numerical values so we can use the **mutual_info_regression()** method which only works with numerical values:

In [None]:
# Frequency of each distinct value in the year column.
df['year'].value_counts()

In [None]:
# Map the "<1366" bucket to "1366" so that every year value can be parsed as an
# integer, then cast the column to int.
# The result is assigned back to the column: replace(..., inplace=True) on a
# selected column is not guaranteed to modify `df`, and the int cast would then
# fail on the leftover "<1366" strings.
df["year"] = df["year"].replace({"<1366": "1366"}).astype(int)

In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Split the columns by dtype using the public select_dtypes API: numeric (and
# boolean) columns are left as they are, every other column is a candidate for
# encoding. The chained-assignment warning is already disabled in the first cell.
numerical_cols = list(df.select_dtypes(include=["number", "bool"]).columns)
categorical_cols = [x for x in df.columns if x not in numerical_cols]

# The free-text columns are not encoded here.
categorical_cols.remove('description')
categorical_cols.remove('title')
print(categorical_cols)

# Replace each remaining column by ordinal codes. Values are cast to str first
# so each column is encoded from a uniform string representation.
ord_enc = OrdinalEncoder()
df[categorical_cols] = ord_enc.fit_transform(df[categorical_cols].astype(str))

### Converting all floats to int:

In [None]:
# Columns to cast to plain integers, each listed exactly once.
int_cols = ['brand', 'category', 'created_at', 'mileage']

# A single astype call with a column -> dtype mapping converts them all.
df = df.astype({col: int for col in int_cols})

In [None]:
# First 10 rows after encoding and casting.
df.head(10)

In [None]:
# Column dtypes after the conversions above.
df.dtypes

Now we drop the columns that we don't want to include in our **mutual information** calculation: the free-text `description` and `title` columns, plus the target `price`.

In [None]:
# Target vector.
y = df['price']

# Feature matrix: every column except the target and the two free-text columns.
X = df.drop(columns=['description', 'title', 'price'])

In [None]:
# First 10 rows of the feature matrix.
X.head(10)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import mutual_info_regression

# Hold out 40% of the rows; the scores are estimated on the training part only.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.4, random_state = 0)

# mutual_info_regression is randomized (it adds small noise to continuous
# features), so its seed is fixed to make the scores reproducible across runs.
mi = mutual_info_regression(X_train, y_train, random_state = 0)

# Label each score with its feature name and order from most to least informative.
mi = pd.Series(mi, index = X_train.columns).sort_values(ascending = False)

mi

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the mutual-information score of each feature, drawn on an
# explicitly created axes object.
fig, ax = plt.subplots()
mi.plot.bar(ax = ax)
ax.set_title('Mutual information with respect to features')
plt.show()

## Word Extraction:

In [None]:
# Read the stop-word list, one word per line, stripping surrounding whitespace.
# The encoding is passed explicitly because open() otherwise uses the platform
# default. Assumes stop_words.txt is UTF-8 encoded — TODO confirm.
with open('stop_words.txt', encoding = 'utf-8') as f:
    ignore = [line.strip() for line in f]

print("Some stop words:")
print(ignore[30:45])

In [None]:
# NOTE(review): disabled cell. When enabled it tokenizes every description with
# hazm.word_tokenize and counts the tokens that are not in the stop-word list
# `ignore`.
# import hazm
# from collections import Counter

# words_cnt = Counter()

# for sentence in df['description']:
#     words = hazm.word_tokenize(sentence)
#     for w in words:
#         if w in ignore:
#             continue
        
#         words_cnt[w] = words_cnt.get(w, 0) + 1

In [None]:
# NOTE(review): disabled cell. Same counting loop as the previous cell, applied
# to the title column. It creates a new Counter, so if both cells are re-enabled
# the description counts are discarded rather than merged.
# import hazm
# from collections import Counter

# words_cnt = Counter()

# for sentence in df['title']:
#     words = hazm.word_tokenize(sentence)
#     for w in words:
#         if w in ignore:
#             continue
        
#         words_cnt[w] = words_cnt.get(w, 0) + 1

In [None]:
# NOTE(review): disabled cell. Prints the 50 most frequent words from `words_cnt`,
# which is only defined when one of the counting cells above is enabled.
# most_common_words = words_cnt.most_common(50)
# for word in most_common_words[:50]:
#     print(word)

## One Hot Encoding:

We use one-hot encoding for the following features:

1. Brand
2. Heavy/Light category
3. 10 most common words

In [None]:
# Build one indicator column per brand code, then swap them in for the
# original 'brand' column.
brand_one_hot = pd.get_dummies(df['brand'], prefix = 'brand')
df = pd.concat([df.drop(columns = ['brand']), brand_one_hot], axis = 1)

In [None]:
# Build one indicator column per category code, then swap them in for the
# original 'category' column.
cat_one_hot = pd.get_dummies(df['category'], prefix = 'cat')
df = pd.concat([df.drop(columns = ['category']), cat_one_hot], axis = 1)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
import hazm

# Work on a copy without the free-text columns. reset_index() gives ndf a fresh
# 0..n-1 index (the old index is kept in an 'index' column) so that the count
# frame built below, which also has a 0..n-1 index, lines up with it row by row.
ndf = df.drop(['description', 'title'], axis = 1)
ndf = ndf.reset_index()

# Words whose per-description occurrence counts become feature columns.
words = ['دوگانه', 'روکش', 'تخفیف', 'نو', 'رنگ', 'وانت', 'مدادی', 'سالم', 'بیمه', 'سفید']

# Passing the word list as a fixed vocabulary makes the vectorizer count only
# these words, with output columns in the same order as `words`. A word that
# never occurs simply gets an all-zero column instead of raising KeyError on a
# vocabulary lookup, and the full-vocabulary matrix is never built.
cnt_vec = CountVectorizer(analyzer = 'word', tokenizer = hazm.word_tokenize, vocabulary = words)

fit = cnt_vec.fit_transform(df['description'])
sel_fit = fit.toarray()

# Total number of occurrences of each word over all descriptions.
print(sel_fit.sum(axis = 0))
ndf[words] = pd.DataFrame(sel_fit, columns = words)


display(ndf)

## Normalization:

In [None]:
# Target: the price column.
y = ndf['price']

# Features: everything else, minus the 'index' column left over from reset_index().
X = ndf.drop(columns = ['price', 'index'])

In [None]:
def normalize(df):
    """Min-max scale every column of ``df`` to the range [0, 1].

    Each column is transformed as ``(x - min) / (max - min)``. A column whose
    values are all equal has a zero range; it is mapped to 0.0 instead of
    producing NaN from a 0/0 division.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are all numeric or boolean.

    Returns
    -------
    pandas.DataFrame
        New float frame with the same index and columns; ``df`` is not modified.
    """
    # Work in float so boolean indicator columns can be subtracted and divided.
    values = df.astype(float)
    lows = values.min()
    spans = values.max() - lows
    # Dividing by 1 where the range is zero leaves (x - min) == 0 for that column.
    spans = spans.where(spans != 0, 1.0)
    return (values - lows) / spans

# Rescale every feature column with normalize() so no feature dominates the
# distance computation through its scale alone.
X = normalize(X)

In [None]:
# Display the normalized feature matrix.
X

In [None]:
# Display the target values (prices are left unscaled).
y

## Testing

In [None]:
from sklearn.metrics import mean_squared_error

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.3,
    random_state = 0,
)

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Fit a 200-neighbour KNN regressor on the training split (fit() returns the
# estimator itself), then predict prices for the held-out rows.
KNN = KNeighborsRegressor(n_neighbors = 200).fit(X_train, y_train)
price_KNN = KNN.predict(X_test)

In [None]:

# mean_squared_error expects (y_true, y_pred). The RMSE is derived from the MSE
# directly instead of through the `squared` flag, which newer scikit-learn
# releases no longer accept.
mse = mean_squared_error(y_test, price_KNN)
print('RMSE:', mse ** 0.5)
print('MSE:', mse)