In [1]:
# Standard library imports
import gc
import math
import re

# Third-party imports
import lightgbm as lgb
import pandas as pd
import numpy as np
import seaborn as sns
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_squared_log_error, make_scorer
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, Lasso, Ridge, RidgeCV, BayesianRidge
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
from sklearn.utils import shuffle
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from scipy.sparse import csr_matrix
from scipy import sparse
import scipy

# Jupyter notebook magic command
%matplotlib inline

# Additional setup
stopwords1 = set(STOPWORDS)


In [2]:
def show_wordcloud(data, title=None):
    wordcloud = WordCloud(
        background_color='black',
        stopwords=stopwords1,
        max_words=200,
        max_font_size=40,
        scale=3,
        random_state=1
    ).generate(str(data))

    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')

    if title: 
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)

    plt.imshow(wordcloud)
    plt.show()

In [3]:
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")
sample = pd.read_csv("Sample Submission.csv")

print(train.shape, test.shape, sample.shape)

(5429, 9) (2327, 8) (2327, 1)


In [None]:
dataset = pd.concat([train, test])
dataset.reset_index(drop=True)
dataset = dataset.sort_values(by=['session_id'])
dataset["client_agent_length"] = dataset["client_agent"].str.len()
dataset["client_agent_semicolon_count"] = dataset["client_agent"].str.count(";")
dataset["client_agent_version_count"] = dataset["client_agent"].str.count("/")
dataset["browser_version"] = dataset["client_agent"].str.split("/",expand=True)[1].str.split(" ",expand=True)[0]
dataset [["device","browser"]] = dataset["device_details"].str.split(pat=" - ",expand=True)

handheld = [int(device in ["iPhone","Android Phone","Android Tablet","iPad"]) for device in dataset ["device"]]
dataset ["hand_held"] = pd.Series(handheld)

desktop = [int(i == "Desktop") for i in dataset ["device"]]
dataset ["desktop"] = pd.Series(desktop)

others = [int(i in ["Other","Unknown"]) for i in dataset["device"]]
dataset["others"] = pd.Series(others)
dataset.head()


Unnamed: 0,session_id,session_number,client_agent,device_details,date,purchased,added_in_cart,checked_out,time_spent,client_agent_length,client_agent_semicolon_count,client_agent_version_count,browser_version,device,browser,hand_held
3696,0000ccfee64caa1d4ac5e2c025183bd5,77,Product/4.2.2 iPhone/7.1.2,iPhone - iOS,2019-10-07,1,0,0,175.537,26.0,0.0,2.0,4.2.2,iPhone,iOS,1
111,0001351371faa5a11668bd7e595d718b,33,Mozilla/5.0 (Windows NT 6.3; WOW64; Trident/7....,Desktop - IE,2020-02-28,1,0,0,849.612,68.0,3.0,2.0,5.0,Desktop,IE,1
2777,00061e9b19af901ed6172af75447e0fe,220,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,Desktop - Chrome,2020-02-22,0,0,0,40.83,102.0,0.0,4.0,5.0,Desktop,Chrome,1
504,0007570f57deb464f48e062d9751cb95,3344,Product/4.7.0 iPhone/8.1,iPhone - iOS,2019-12-26,0,1,0,,24.0,0.0,2.0,4.7.0,iPhone,iOS,0
5204,0007799644b7b1eec55a6390deda8065,143,Product/4.1.1 iPhone/7.1.2,iPhone - iOS,2019-08-11,0,0,0,1046.787,26.0,0.0,2.0,4.1.1,iPhone,iOS,0
