In [4]:
import pandas as pd 
from numpy import array
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding

import tensorflow as tf

In [5]:
train = pd.read_csv('ammi-bootcamp-kaggle-competition/train.csv', index_col='id')
test = pd.read_csv('ammi-bootcamp-kaggle-competition/test.csv', index_col='id')
submission = pd.read_csv('ammi-bootcamp-kaggle-competition/Sample_Submission.csv')
print(train.shape)
print(test.shape)
print(submission.shape)

(175000, 13)
(83210, 14)
(83210, 2)


In [8]:
# Function to calculate missing values by column
def missing_values_table(df):
        # Total missing values
        mis_val = df.isnull().sum()
        
        # Percentage of missing values
        mis_val_percent = 100 * df.isnull().sum() / len(df)
        
        # Make a table with the results
        mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
        
        # Rename the columns
        mis_val_table_ren_columns = mis_val_table.rename(
        columns = {0 : 'Missing Values', 1 : '% of Total Values'})
        
        # Sort the table by percentage of missing descending
        mis_val_table_ren_columns = mis_val_table_ren_columns[
            mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
        '% of Total Values', ascending=False).round(1)
        
        # Print some summary information
        print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
            "There are " + str(mis_val_table_ren_columns.shape[0]) +
              " columns that have missing values.")
        
        # Return the dataframe with missing information
        return mis_val_table_ren_columns

In [12]:
labels = train['price']
print(labels.shape)

(175000,)


In [13]:
train = train.drop(columns = ['taster_twitter_handle', 'taster_name', 'price'])
test = test.drop(columns = ['index','taster_twitter_handle','taster_name', 'price'])
print(train.shape)
print(test.shape)

(175000, 10)
(83210, 10)


In [14]:
missing_values_table(train)

Your selected dataframe has 10 columns.
There are 7 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
region_2,99606,56.9
title,92811,53.0
designation,52266,29.9
region_1,28534,16.3
country,47,0.0
province,47,0.0
variety,1,0.0


In [15]:
train = train.drop(columns = ['region_2', 'title'])
test = test.drop(columns = ['region_2', 'title'])
print(train.shape)
print(test.shape)

(175000, 8)
(83210, 8)


In [16]:
train.head()

Unnamed: 0_level_0,country,description,designation,points,province,region_1,variety,winery
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
32027,Portugal,This is a fine rich balanced wine. It has ripe...,Vila Santa Reserva,88.870874,Alentejano,,PORTUGUESE RED,J. Portugal Ramos
71079,France,"A solid, chunky wine, with a structure that is...",,88.041695,Bordeaux,Lalande de Pomerol,BORDEAUX-STYLE RED BLEND,Château Tour Grand Colombier
32440,France,"This is powerful and concentrated, with the hi...",,94.085021,Bordeaux,Saint-Émilion,BORDEAUX-STYLE RED BLEND,Château Figeac
124405,US,"Rich, ripe and oaky, this Petite Sirah charms ...",Thompson Vineyard,89.869797,California,Santa Barbara County,PETITE SIRAH,Jaffurs
33649,US,This wine is a unique in the state blend and f...,McKinley Springs Vineyard,89.017651,Washington,Horse Heaven Hills,ROSé,Syncline


In [17]:
# Data Imputation
for column in list(train.columns):
    train[str(column)].fillna(train[str(column)].mode()[0], inplace=True)
    test[str(column)].fillna(test[str(column)].mode()[0], inplace=True)

In [None]:
from nltk.tokenize import word_tokenize
train.columns = train.columns.astype(str)
test.columns = test.columns.astype(str)
vocab_size = 20000
word_count = lambda sentence: len(word_tokenize(sentence))
dic = {}
DICT = {}
for column in  train.drop(columns = ['points']).columns:
    column_name = column
    longest_sentence = max(train[str(column)], key=word_count)
    length_long_sentence = len(word_tokenize(longest_sentence))
    column_train = [one_hot(sent, vocab_size) for sent in train[str(column)]]
    column_test = [one_hot(sent, vocab_size) for sent in test[str(column)]]
    dic[str(column_name)] = pad_sequences(column_train, length_long_sentence, padding='post')
    DICT[str(column_name)] = pad_sequences(column_test, length_long_sentence, padding='post')

In [83]:
dic['country']

array([[11307,     0,     0],
       [13441,     0,     0],
       [13441,     0,     0],
       ...,
       [ 8688,     0,     0],
       [ 3895,     0,     0],
       [ 3895,     0,     0]], dtype=int32)

In [37]:
import numpy as np

TRAIN = np.concatenate((dic['country'], dic['description'], dic['designation'], dic['province'], dic['region_1'], dic['variety'], dic['winery'], train[['points']].to_numpy()), axis=1)
TEST = np.concatenate((DICT['country'], DICT['description'], DICT['designation'], DICT['province'], DICT['region_1'], DICT['variety'], DICT['winery'], test[['points']].to_numpy()), axis=1)
print(TRAIN.shape)
print(TEST.shape)

(175000, 210)
(83210, 210)


In [71]:
from keras.models import Sequential
from keras import layers
from keras.callbacks import ModelCheckpoint
# checkpoint
filepath="weights-improvement-{epoch:02d}-{val_accuracy:.2f}.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]
embedding_dim = 18
A_model = Sequential()
A_model.add(layers.Embedding(input_dim=20000, output_dim=embedding_dim, input_length=TRAIN.shape[1]))
A_model.add(layers.Flatten())
A_model.add(layers.Dense(13, activation='relu'))
# A_model.add(layers.Dense(8, activation='relu'))
A_model.add(layers.Dense(1, activation='linear'))
A_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])
A_model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_11 (Embedding)     (None, 210, 18)           360000    
_________________________________________________________________
flatten_11 (Flatten)         (None, 3780)              0         
_________________________________________________________________
dense_29 (Dense)             (None, 13)                49153     
_________________________________________________________________
dense_30 (Dense)             (None, 1)                 14        
Total params: 409,167
Trainable params: 409,167
Non-trainable params: 0
_________________________________________________________________


In [72]:
A_model.fit(TRAIN, labels, epochs=30, batch_size = 16, validation_split=0.1, callbacks=callbacks_list, shuffle=True)

Train on 157500 samples, validate on 17500 samples
Epoch 1/30

Epoch 00001: val_accuracy improved from -inf to 0.04949, saving model to weights-improvement-01-0.05.hdf5
Epoch 2/30

Epoch 00002: val_accuracy improved from 0.04949 to 0.05377, saving model to weights-improvement-02-0.05.hdf5
Epoch 3/30

Epoch 00003: val_accuracy improved from 0.05377 to 0.06131, saving model to weights-improvement-03-0.06.hdf5
Epoch 4/30

Epoch 00004: val_accuracy did not improve from 0.06131
Epoch 5/30

Epoch 00005: val_accuracy improved from 0.06131 to 0.06251, saving model to weights-improvement-05-0.06.hdf5
Epoch 6/30

Epoch 00006: val_accuracy improved from 0.06251 to 0.07097, saving model to weights-improvement-06-0.07.hdf5
Epoch 7/30

Epoch 00007: val_accuracy did not improve from 0.07097
Epoch 8/30

Epoch 00008: val_accuracy did not improve from 0.07097
Epoch 9/30

Epoch 00009: val_accuracy improved from 0.07097 to 0.07211, saving model to weights-improvement-09-0.07.hdf5
Epoch 10/30

Epoch 00010:

<keras.callbacks.callbacks.History at 0x7f42b37a2e10>

In [73]:
# Please After Training Get the best model and best it below. 

A_model.load_weights("---------------------")
A_model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

In [74]:
predictions = A_model.predict(TEST)
predictions.shape

(83210, 1)

In [75]:
predictions

array([[126.06577 ],
       [ 37.875286],
       [ 52.801056],
       ...,
       [ 44.180225],
       [ 20.896353],
       [ 49.89761 ]], dtype=float32)

In [49]:
predictions

array([[135.25145 ],
       [ 39.39938 ],
       [ 50.85417 ],
       ...,
       [ 26.403248],
       [ 29.735802],
       [ 58.787224]], dtype=float32)

In [77]:
subm_df = pd.read_csv('ammi-bootcamp-kaggle-competition/Sample_Submission.csv')
subm_df.head()

Unnamed: 0,id,price
0,0,50
1,1,50
2,2,50
3,3,50
4,4,50


In [80]:
# Please type the 'submission' filename
subm_df['price'] = predictions
subm_df.to_csv('-----------------.csv', index=False)