In [None]:
import tensorflow as tf

In [None]:
!wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/datasetb2d9982.zip

In [None]:
!unzip datasetb2d9982.zip

In [None]:
import pandas as pd
train_df = pd.read_csv("dataset/train.csv")

In [None]:
!pip install scikit-learn

import time
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer

import warnings
warnings.filterwarnings('ignore')

import gc


In [None]:
train_df.astype('object').describe().transpose()

In [None]:
from string import punctuation

In [None]:
# One (symbol, replacement) pair per punctuation character; the replacement is always ''.
punctuation_symbols = [(mark, '') for mark in punctuation]

In [None]:
!pip install nltk
import nltk
from nltk.corpus import stopwords
# Make sure the NLTK 'stopwords' corpus is available before it is read.
nltk.download('stopwords')
# English stop-word list; remove_stop_words() filters tokens against this module-level name.
stop = stopwords.words('english')

In [None]:
import string
def remove_punctuation(sentence: str) -> str:
    """Return ``sentence`` with every character of ``string.punctuation`` deleted.

    All other characters (letters, digits, whitespace) are left untouched.
    """
    # Mapping a character to None in a translation table deletes it.
    deletion_table = str.maketrans(dict.fromkeys(string.punctuation))
    return sentence.translate(deletion_table)

def remove_stop_words(x):
    """Lower-case ``x`` and drop every token found in the module-level ``stop`` list.

    Tokens are produced by splitting on single spaces, so consecutive spaces yield
    empty tokens that are kept; the survivors are re-joined with single spaces.
    """
    kept_tokens = []
    for token in x.lower().split(' '):
        if token in stop:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)

def to_lower(x):
    """Return the lower-cased form of ``x`` (delegates to ``x.lower()``)."""
    lowered = x.lower()
    return lowered

In [None]:
# Replace every missing value in the training frame with an empty string so the
# string-cleaning helpers can be applied to the text columns without meeting NaN.
# NOTE(review): fillna('') is applied to all columns, not only the text ones — confirm
# that PRODUCT_LENGTH has no missing values, otherwise they become '' here.
df_train_reduced = train_df.fillna('')
df_train_reduced.shape

In [None]:
from nltk.stem.porter import PorterStemmer
porter = PorterStemmer()

# Cleaning steps shared by every text column, applied in this order.
_train_common_steps = (remove_punctuation, remove_stop_words, to_lower)

# DESCRIPTION and BULLET_POINTS are additionally passed through porter.stem first;
# TITLE is not stemmed.
# NOTE(review): porter.stem receives the whole cell string rather than individual
# tokens — confirm this is intended, since a stemmer normally works word by word.
_train_column_plan = (
    ('DESCRIPTION', (porter.stem,)),
    ('TITLE', ()),
    ('BULLET_POINTS', (porter.stem,)),
)

for column_name, leading_steps in _train_column_plan:
    for step in leading_steps + _train_common_steps:
        df_train_reduced[column_name] = df_train_reduced[column_name].apply(step)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer as cv, TfidfVectorizer

In [None]:
from collections import Counter

def reducecolumns(Col, min_doc_fraction=.001):
    """Drop columns (terms) that are non-zero in too few rows (documents).

    Parameters
    ----------
    Col : 2-D matrix
        Anything exposing ``.shape``, ``.nonzero()`` and column fancy-indexing,
        e.g. a SciPy CSR matrix or a NumPy array.
    min_doc_fraction : float, default 0.001
        A column is kept only when its number of non-zero rows is strictly
        greater than ``min_doc_fraction * n_rows``.

    Returns
    -------
    ``Col`` restricted to the surviving columns, in their original
    left-to-right order, so column ``j`` of the result still lines up with the
    ``j``-th surviving entry of the vectorizer's vocabulary.
    """
    n_rows, n_cols = Col.shape
    # nonzero()[1] holds the column index of every non-zero entry, so a bincount
    # over it is the per-column document frequency (computed without a Python loop).
    doc_freq = np.bincount(Col.nonzero()[1], minlength=n_cols)
    # flatnonzero yields ascending indices, which keeps the original column order.
    cols_to_keep = np.flatnonzero(doc_freq > min_doc_fraction * n_rows)
    return Col[:, cols_to_keep]

In [None]:
# Bag-of-words counts for TITLE; min_df=10 drops terms that occur in fewer than 10 titles.
# Note: this assignment rebinds `cv`, which an earlier import had bound as an alias
# for the CountVectorizer class itself.
cv = CountVectorizer(min_df=10)
X_name = cv.fit_transform(df_train_reduced['TITLE'])

In [None]:
# TF-IDF over unigrams and bigrams, with the vocabulary capped at 50,000 features.
tv = TfidfVectorizer(max_features=50000, ngram_range=(1, 2), stop_words='english')
X_description = tv.fit_transform(df_train_reduced['DESCRIPTION'])
# fit_transform re-fits the same `tv` object, so from here on `tv` holds the
# BULLET_POINTS vocabulary and idf values, not the DESCRIPTION ones.
X_bullet = tv.fit_transform(df_train_reduced['BULLET_POINTS'])

In [None]:
# Report the shape each feature matrix has after rare-column filtering.
for caption, matrix in (
    ("Title points Shape: ", X_name),
    ("Item Description Shape: ", X_description),
    ("Bullet points Shape: ", X_bullet),
):
    print(caption + str(reducecolumns(matrix).shape))

In [None]:
# Pair every term of the most recently fitted `tv` with its idf value and
# tabulate the result as a one-column frame indexed by term.
idf_by_term = dict(zip(tv.get_feature_names_out(), tv.idf_))
tfidf_weights = pd.DataFrame.from_dict(idf_by_term, orient='index')
tfidf_weights.columns = ['tfidf_weights']

# Lowest idf first.
tfidf_weights.sort_values(by=['tfidf_weights'], ascending=True)

In [None]:
# Filter each feature matrix, then stack them side by side in the order
# description | bullet points | title and convert to CSR.
reduced_blocks = [reducecolumns(block) for block in (X_description, X_bullet, X_name)]
sparse_merge = hstack(reduced_blocks).tocsr()
print(sparse_merge.shape)

In [None]:
# Regression target is log(1 + PRODUCT_LENGTH); predictions are mapped back with
# np.expm1 further down in the notebook.
product_length = df_train_reduced['PRODUCT_LENGTH']
target = np.log(product_length + 1)
print(target.shape)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 35% of the rows for evaluation; random_state pins the shuffle.
features_train, features_test, target_train, target_test = train_test_split(
    sparse_merge,
    target,
    test_size=.35,
    random_state=1,
)


In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasRegressor

In [None]:
def regressor_model():
    """Build and compile the regression network handed to KerasRegressor.

    Architecture: one hidden Dense layer of 7 ReLU units followed by a single
    Dense output unit with no activation specified. The input width is read from
    the module-level ``sparse_merge``. Compiled with mean-squared-error loss,
    the Adam optimizer and MAE as an extra metric.
    """
    n_features = sparse_merge.shape[1]
    hidden_layer = Dense(units=7, kernel_initializer='uniform', activation='relu', input_dim=n_features)
    output_layer = Dense(1, kernel_initializer='uniform')

    network = Sequential()
    for layer in (hidden_layer, output_layer):
        network.add(layer)
    network.compile(loss='mean_squared_error', optimizer='adam', metrics=['mae'])
    return network

In [None]:
model = KerasRegressor(build_fn= regressor_model, verbose=1)


In [None]:
model.fit(features_train.toarray(),target_train)  

In [None]:
target_pred = model.predict(features_test.toarray())

In [None]:
features_test.toarray().shape

In [None]:
target_train

In [None]:
model.model.save('saved_model.h5')

In [None]:
target_pred

In [None]:
np.expm1(target_pred)

In [None]:
from sklearn import metrics

In [None]:
# target_test and target_pred are both in log(1 + length) space (see how `target`
# is built), so a percentage error taken directly on them measures error on the
# logarithm, not on the product length. Undo the transform first so the score is
# computed on the same scale as PRODUCT_LENGTH.
actual_length = np.expm1(target_test)
predicted_length = np.expm1(target_pred)
mape = metrics.mean_absolute_percentage_error(actual_length, predicted_length)
# Score is 100 * (1 - MAPE), floored at 0.
score = max(0, 100 * (1 - mape))
print(score)

TEST

In [None]:
test_df = pd.read_csv("dataset/test.csv")

In [None]:
df_test_reduced = test_df.fillna('')
df_test_reduced.shape

In [None]:
# Same cleaning sequence as used for the training frame, applied to the test frame.
_test_common_steps = (remove_punctuation, remove_stop_words, to_lower)

# DESCRIPTION and BULLET_POINTS are additionally passed through porter.stem first;
# TITLE is not stemmed.
# NOTE(review): porter.stem receives the whole cell string rather than individual
# tokens — confirm this is intended, since a stemmer normally works word by word.
_test_column_plan = (
    ('DESCRIPTION', (porter.stem,)),
    ('TITLE', ()),
    ('BULLET_POINTS', (porter.stem,)),
)

for column_name, leading_steps in _test_column_plan:
    for step in leading_steps + _test_common_steps:
        df_test_reduced[column_name] = df_test_reduced[column_name].apply(step)

In [None]:
var = sparse_merge.shape[1]

In [None]:
# FIXME(review): these vectorizers are fitted on the *test* text, so their vocabularies
# (and therefore the meaning of each column) are learned independently of the training
# features the model was fitted on. Capping max_features at var // 3 only bounds the
# width; it does not make column i refer to the same term as training column i.
tv = TfidfVectorizer(max_features = var//3, ngram_range=(1, 2), stop_words='english')
X_description = tv.fit_transform(df_test_reduced['DESCRIPTION'])
# Same `tv` object re-fitted on BULLET_POINTS; afterwards it holds the bullet vocabulary.
X_bullet = tv.fit_transform(df_test_reduced['BULLET_POINTS'])

In [None]:
# The two TF-IDF blocks are each capped at var // 3 columns; the title block gets
# whatever remains, so the three caps always add up to exactly `var`
# (var - (2*var)//3 would come up one short whenever var % 3 == 2).
title_feature_budget = var - 2 * (var // 3)
# NOTE(review): max_features is only an upper bound — with min_df=10 the fitted
# vocabulary may be smaller, so the stacked test matrix is not guaranteed to be
# `var` columns wide. Verify the shape before predicting.
cv = CountVectorizer(min_df=10, max_features=title_feature_budget)
X_name = cv.fit_transform(df_test_reduced['TITLE'])

In [None]:
# Report the shape of each test feature matrix.
for caption, matrix in (
    ('Title shape:', X_name),
    ('Description shape:', X_description),
    ('Bullet points shape:', X_bullet),
):
    print(caption, matrix.shape)

In [None]:
# Pair every term of the most recently fitted `tv` with its idf value and
# tabulate the result as a one-column frame indexed by term.
idf_by_term = dict(zip(tv.get_feature_names_out(), tv.idf_))
tfidf_weights = pd.DataFrame.from_dict(idf_by_term, orient='index')
tfidf_weights.columns = ['tfidf_weights']

# Ten terms with the highest idf.
tfidf_weights.sort_values(by=['tfidf_weights'], ascending=False).head(10)

In [None]:
sparse_merge_test = hstack((X_description, X_bullet, X_name)).tocsr()
print(sparse_merge_test.shape)

In [None]:
target_pred_test = model.predict(sparse_merge_test.toarray())

In [None]:
# Map predictions back from log(1 + length) space, attach the product ids from the
# test frame, and index the submission table by PRODUCT_ID.
finaly = (
    pd.DataFrame(np.expm1(target_pred_test), columns=['PRODUCT_LENGTH'])
    .assign(PRODUCT_ID=df_test_reduced['PRODUCT_ID'])
    .set_index('PRODUCT_ID')
)

In [None]:
finaly.to_csv('entropy_preds.csv')