In [1]:
import pandas as pd

import numpy as np
import itertools
import scipy.stats as stats

import os

import matplotlib as mpl
import matplotlib.pyplot as plt

import time 

# increase the maximum datapoints plotable
mpl.rcParams['agg.path.chunksize'] = 10000

# autofill on jupyter
%config IPCompleter.greedy=True

# ploting inline
%matplotlib inline

# help
#?str.split()

In [2]:
# exteranl insertion
# %load file.py

# !ls

In [3]:
cwd = os.getcwd()

### Ingesting

In [4]:
%%time

df_wishlist = pd.read_csv(cwd + "/01_Input/child_wishlist.csv", header=None)
df_goodkids = pd.read_csv(cwd + "/01_Input/gift_goodkids.csv", header=None)
df_sample_sub = pd.read_csv(cwd + "/01_Input/sample_submission_random.csv")

CPU times: user 1.53 s, sys: 174 ms, total: 1.7 s
Wall time: 1.8 s


### Exploratory

### child wishlist

In [5]:
df_wishlist.shape

(1000000, 11)

In [6]:
df_wishlist.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10
0,0,871,409,547,423,660,679,281,220,834,152
1,1,929,421,498,226,345,156,229,900,749,692


In [7]:
# colunm renamed
gift = ["g" + str(name) for name in range (10)]
gift.insert(0, "child")
df_wishlist.columns = gift

In [8]:
df_wishlist.head(2)

Unnamed: 0,child,g0,g1,g2,g3,g4,g5,g6,g7,g8,g9
0,0,871,409,547,423,660,679,281,220,834,152
1,1,929,421,498,226,345,156,229,900,749,692


In [9]:
# unique gifts
len(np.unique(df_wishlist.loc[:,"g1":"g10"].values))

1000

In [10]:
all_gifts = df_wishlist.loc[:,"g0":"g9"].values.tolist()
all_gifts_flat = list(itertools.chain(*all_gifts))

In [11]:
all_gifts_flat_ = pd.DataFrame(all_gifts_flat,columns=["g0"])

In [12]:
# check distribution
# fit = stats.norm.pdf(all_gifts_flat, np.mean(all_gifts_flat), np.std(all_gifts_flat))

# hmean = np.mean(all_gifts_flat)
# hstd = np.std(all_gifts_flat)
# pdf = stats.norm.pdf(all_gifts_flat, hmean, hstd)
# plt.plot(all_gifts_flat, pdf)

### gift good kids

In [13]:
df_goodkids.shape

(1000, 1001)

In [14]:
df_goodkids.head(2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,991,992,993,994,995,996,997,998,999,1000
0,0,1373,752341,100505,469875,854398,296716,564625,653466,783635,...,370958,727201,488828,191760,12036,101763,225121,257948,572060,876079
1,1,626294,218116,646840,15625,701320,71249,617497,341483,501812,...,271311,850930,189870,609739,575242,79309,642355,582890,573428,162350


In [15]:
# colunm renamed
priority = ["prior" + str(name) for name in range (0,1000)]
priority.insert(0, "gift")
df_goodkids.columns = priority

In [16]:
df_goodkids.head(2)

Unnamed: 0,gift,prior0,prior1,prior2,prior3,prior4,prior5,prior6,prior7,prior8,...,prior990,prior991,prior992,prior993,prior994,prior995,prior996,prior997,prior998,prior999
0,0,1373,752341,100505,469875,854398,296716,564625,653466,783635,...,370958,727201,488828,191760,12036,101763,225121,257948,572060,876079
1,1,626294,218116,646840,15625,701320,71249,617497,341483,501812,...,271311,850930,189870,609739,575242,79309,642355,582890,573428,162350


In [17]:
# unique good kids
len(np.unique(df_goodkids.loc[:,"prior1":"prior999"].values))

567508

In [18]:
# list with all kids
all_kids = df_goodkids.loc[:,"prior1":"prior999"].values.tolist()
all_kids_flat = list(itertools.chain(*all_kids))

In [19]:
# fit = stats.norm.pdf(all_kids_flat, np.mean(all_kids_flat), np.std(all_kids_flat))

# hmean = np.mean(all_kids_flat)
# hstd = np.std(all_kids_flat)
# pdf = stats.norm.pdf(all_kids_flat, hmean, hstd)
# plt.plot(all_kids_flat, pdf)

In [20]:
# Removing all the kids who aren't in the priority list from Santa, which means they can get their preferable 
# gift from the whislist

# Also removing the 4k first kids from the analyze, once they need to receive the same gift (twins)

## Matching Interest: (greedy)
* split data to remove twins
* remove the kids non present in the priority list (solved case: given then the preferable gift)
* map the kids who wants what the priority list name

In [21]:
# - T: twins
# - NT: non twins

# - P: in priority list
# - NP: not in priority list

In [22]:
# spliting wishlist into 
    # - twin
    # - non twin

df_wishlist_T = df_wishlist.iloc[:4000,:].copy()
df_wishlist_NT = df_wishlist.iloc[4000:,:].copy()

df_wishlist_NT.insert(0,'result', 0)
df_wishlist_T.insert(0,'result', 0)

# spliting kids into
    # - mapped by priority list
    # - non mapped by priorty list
    
all_kids = [kids for kids in range(1000000)]
kids_P = np.unique(df_goodkids.loc[:,"prior1":"prior999"].values).tolist()
kids_NP = list(set(all_kids) - set(kids_P))

kids_NP_NT = [kid for kid in kids_P if kid > 3999]

In [23]:
def getGift(df, mapped_kids, priority):
    print(level)
    df.loc[:,'result'] = df.loc[mapped_kids,:]["%s" % priority].astype(int)
    return df

In [24]:
df_goodkids.head(2)

Unnamed: 0,gift,prior0,prior1,prior2,prior3,prior4,prior5,prior6,prior7,prior8,...,prior990,prior991,prior992,prior993,prior994,prior995,prior996,prior997,prior998,prior999
0,0,1373,752341,100505,469875,854398,296716,564625,653466,783635,...,370958,727201,488828,191760,12036,101763,225121,257948,572060,876079
1,1,626294,218116,646840,15625,701320,71249,617497,341483,501812,...,271311,850930,189870,609739,575242,79309,642355,582890,573428,162350


In [25]:
df_wishlist_NT.head(2)

Unnamed: 0,result,child,g0,g1,g2,g3,g4,g5,g6,g7,g8,g9
4000,0,4000,700,440,350,840,780,604,81,197,88,504
4001,0,4001,656,507,498,471,964,913,343,541,169,419


In [None]:
# %%time
# for gift in range (10):
#     for priority in range (1000):
#         greedy_result = df_wishlist_NT['g%s' %gift].isin(df_goodkids['prior%s' %priority])
        
        
#         for child in greedy_result.index[greedy_result].tolist():
#             df_wishlist_NT.loc[child:child,'result'] = df_wishlist_NT.loc[child:child,'g%s' %gift]

## Max/Min-Cost Flow 