In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
# Show the entries of the input directory as reported by `ls`.
input_listing = check_output(["ls", "../input"])
print(input_listing.decode("utf8"))

# Any results you write to the current directory are saved as output.

In [None]:
from sklearn.linear_model import Ridge
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from scipy.sparse import hstack

In [None]:
# Read the tab-separated train and test tables from the input directory.
train_path, test_path = '../input/train.tsv', '../input/test.tsv'
train_df = pd.read_csv(train_path, sep='\t')
test_df = pd.read_csv(test_path, sep='\t')

In [None]:
# Number of training rows, i.e. the position where test rows start once
# train and test are stacked.
train_test_border = train_df.shape[0]

In [None]:
# Stack train on top of test (id columns and target removed) so both receive
# identical preprocessing; each frame keeps its original row labels.
train_features = train_df.drop(['train_id', 'price'], axis=1)
test_features = test_df.drop(['test_id'], axis=1)
train_test_df = pd.concat([train_features, test_features], axis=0)

In [None]:
# Missing values per column, before any imputation.
train_test_df.isnull().sum(axis=0)

In [None]:
# Replace missing text fields with explicit placeholders; the category
# placeholder has three '/'-separated parts so it splits like a real path.
placeholders = (
    ('category_name', 'Nan/Nan/Nan'),
    ('brand_name', 'Nan'),
    ('item_description', 'No description yet'),
)
for column, placeholder in placeholders:
    train_test_df[column] = train_test_df[column].fillna(placeholder)

In [None]:
# Re-check the per-column null counts after filling the text columns.
train_test_df.isnull().sum(axis=0)

In [None]:
# Split each "level0/level1/level2" category path into exactly three parts
# (anything after the second '/' stays in the third part).
# * `n` is passed by keyword: pandas >= 2.0 no longer accepts it positionally.
# * expand=True + reindex gives a rectangular (rows x 3) result even if a path
#   has fewer than three parts; missing levels get the 'Nan' placeholder that
#   this notebook already uses for missing categories.
split_category = (
    train_test_df.category_name
    .str.split('/', n=2, expand=True)
    .reindex(columns=range(3))
    .fillna('Nan')
    .values
)

In [None]:
# One column per category level, taken from the (rows x 3) split array.
level_columns = ('split_category_0', 'split_category_1', 'split_category_2')
for position, column in enumerate(level_columns):
    train_test_df[column] = split_category[:, position]

# Combined "<third-level category>/<brand>" feature, built on the raw arrays
# (element-wise string concatenation).
third_level = train_test_df['split_category_2'].values
brands = train_test_df['brand_name'].values
train_test_df['brand_category_2'] = third_level + '/' + brands

In [None]:
# The raw category path is no longer needed once it has been split into levels.
train_test_df = train_test_df.drop('category_name', axis='columns')

In [None]:
# Row count per top-level category (train and test together).
train_test_df['split_category_0'].value_counts()

## Women

In [None]:
# Per-category model for 'Women': build features, fit Ridge on the train rows
# and start `submit_df` with the test-row predictions.
category = 'Women'

# Top-level category of every training row (same placeholder for missing
# categories as used for train_test_df). The per-category cells below read
# this column to select their training targets. `n` is passed by keyword
# because pandas >= 2.0 rejects it positionally.
train_df.category_name = train_df.category_name.fillna('Nan/Nan/Nan')
train_df['split_category_0'] = train_df.category_name.str.split('/', n=1).str[0]

category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_women` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_women = int(train_mask.sum())
assert category_df.index[:train_women].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_women]
X_submit = X_data[train_women:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_women:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

submit_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df.head()

## Beauty

In [None]:
# Per-category model for 'Beauty': build features, fit Ridge on the train rows
# and append the test-row predictions to `submit_df`.
category = 'Beauty'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.tail()

## Kids

In [None]:
# Per-category model for 'Kids': build features, fit Ridge on the train rows
# and append the test-row predictions to `submit_df`.
category = 'Kids'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.tail()

## Electronics

In [None]:
# Per-category model for 'Electronics': build features, fit Ridge on the train
# rows and append the test-row predictions to `submit_df`.
category = 'Electronics'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.tail()

## Men

In [None]:
# Per-category model for 'Men': build features, fit Ridge on the train rows
# and append the test-row predictions to `submit_df`.
category = 'Men'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.tail()

## Home

In [None]:
# Per-category model for 'Home': build features, fit Ridge on the train rows
# and append the test-row predictions to `submit_df`.
category = 'Home'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.tail()

## Vintage & Collectibles

In [None]:
# Per-category model for 'Vintage & Collectibles': build features, fit Ridge on
# the train rows and append the test-row predictions to `submit_df`.
category = 'Vintage & Collectibles'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.tail()

## Other

In [None]:
# Per-category model for 'Other': build features, fit Ridge on the train rows
# and append the test-row predictions to `submit_df`.
category = 'Other'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.shape

## Handmade

In [None]:
# Per-category model for 'Handmade': build features, fit Ridge on the train
# rows and append the test-row predictions to `submit_df`.
category = 'Handmade'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.shape

## Sports & Outdoors

In [None]:
# Per-category model for 'Sports & Outdoors': build features, fit Ridge on the
# train rows and append the test-row predictions to `submit_df`.
category = 'Sports & Outdoors'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.shape

## Nan

In [None]:
# Per-category model for 'Nan' (rows whose category was missing and received
# the 'Nan/Nan/Nan' placeholder): build features, fit Ridge on the train rows
# and append the test-row predictions to `submit_df`.
category = 'Nan'
category_df = train_test_df[train_test_df.split_category_0 == category]

# TF-IDF (uni- and bi-grams) for each text field; the two fields get
# independent vocabularies because each vectorizer is fitted separately.
tfidf_options = dict(max_df=0.85, min_df=2, ngram_range=(1, 2), max_features=100000)
name_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['name'].values)
item_desc_vecs = TfidfVectorizer(**tfidf_options).fit_transform(category_df['item_description'].values)

# Replace the string-valued columns by integer codes, then one-hot encode
# every remaining column.
category_df = category_df.drop(['name', 'item_description', 'split_category_0'], axis=1)
for column in ['split_category_1', 'split_category_2', 'brand_name', 'brand_category_2']:
    category_df[column] = category_df[column].astype('category').cat.codes
encoder = OneHotEncoder()
X_data = hstack([encoder.fit_transform(category_df), name_vecs, item_desc_vecs]).tocsr()
X_data = np.log1p(X_data)

# train_test_df is train stacked on test, so within the category the first
# `train_category` rows are the training rows; verify that before fitting.
train_mask = train_df.split_category_0 == category
train_category = int(train_mask.sum())
assert category_df.index[:train_category].equals(train_df.index[train_mask.values]), (
    'feature rows are not aligned with the training targets')
y_train = np.log1p(train_df.loc[train_mask, 'price'])
X_train = X_data[:train_category]
X_submit = X_data[train_category:]

# The concat kept test_df's row labels, so use them to read the real test_id
# values instead of assuming test_id equals the row label.
submit_index = category_df.index[train_category:]
test_ids = test_df.loc[submit_index, 'test_id'].values

# The target is fitted in log1p space; expm1 maps predictions back to prices.
ridge = Ridge(alpha=3.1)
ridge.fit(X_train, y_train)
price = np.expm1(ridge.predict(X_submit))

tmp_df = pd.DataFrame({'test_id': test_ids, 'price': price}, columns=['test_id', 'price'])
submit_df = pd.concat([submit_df, tmp_df], axis=0)
submit_df.shape

In [None]:
# Order rows by test_id and verify that every test row received exactly one
# prediction. Rows whose top-level category was not handled by any of the
# per-category cells, or a per-category cell executed twice, would otherwise
# go unnoticed into the submission file.
submit_df = submit_df.sort_values(by=['test_id'], ascending=True).reset_index(drop=True)
assert submit_df.test_id.is_unique, 'duplicate test_id rows in submit_df'
assert len(submit_df) == len(test_df), 'some test rows have no prediction'

In [None]:
# Write the predictions to the working directory, without the row index.
submission_path = 'submission.csv'
submit_df.to_csv(submission_path, index=False)