## Kaggle - Facebook recruiting


### Background


Facebook has simulated a dataset describing an artificial world of more than 100,000 places located in a 10 km by 10 km square.


## Analysis


Let's load the needed libraries and set the global notebook settings.

In [None]:
import pandas as pd
import numpy as np
import pylab as plt
import scipy.signal
import warnings
import seaborn as sns 
from matplotlib import animation,cm

# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Silence every warning for the rest of the notebook.
warnings.filterwarnings('ignore')
# Default figure size for plots that do not set their own.
plt.rcParams['figure.figsize'] = (15.0, 15.0)
# Current seaborn colour palette; indexed below to pick one colour per dataset.
current_palette = sns.color_palette()

## Loading data

We receive two huge files from Facebook (test.csv and train.csv).
We will use pd.read_csv to load the data into data frames.
We will also create another data frame aggregated by place_id.

In [None]:
# Read the train and test CSV files (paths relative to the notebook) into data frames.
df_train = pd.read_csv('../input/train.csv')
df_test = pd.read_csv('../input/test.csv')

## Let's get a feeling for the data

In [None]:
# Display the first rows of the training frame.
df_train.head()

## Let's start exploring the variables

### Time Column

Let's check the distribution of the time column in both the train and test sets.

In [None]:
# Wide, short figure that holds the two time-histogram subplots drawn below.
plt.figure(1, figsize=(12,3))

def plot_time_hist(df,color,title,subplot):
    """Draw a normalised 50-bin histogram of ``df["time"]`` in one subplot.

    Args:
        df: frame with a ``time`` column.
        color: bar colour passed to ``plt.bar``.
        title: title of the subplot.
        subplot: subplot specifier passed to ``plt.subplot`` (e.g. ``121``).

    Returns:
        Tuple ``(counts, bins, binsc)``: raw bin counts, bin edges and bin
        centres, so callers can redraw the same histogram elsewhere.
    """
    counts, bins = np.histogram(df["time"], bins=50)
    bin_widths = np.diff(bins)
    binsc = bins[:-1] + bin_widths/2.

    # Bar heights are the share of rows in each bin, not raw counts.
    fractions = counts/(counts.sum()*1.0)

    plt.subplot(subplot)
    plt.bar(binsc, fractions, width=bin_widths[0], color=color)
    plt.title(title)
    plt.xlabel("Time")
    plt.ylabel("Fraction")
    plt.grid(True)
    return counts, bins, binsc


# Train in the left subplot, test in the right one; the returned histogram
# data is kept so both can be drawn again on a shared axis further down.
counts1, bins1, binsc1 = plot_time_hist(df_train, current_palette[0], 'Train', 121)
counts2, bins2, binsc2 = plot_time_hist(df_test, current_palette[1], 'Test', 122)
plt.show()


 **Insights:**
 
    1. It is easy to see that the test set is a continuation of the train set in time.
    2. It is hard to see any pattern in the time data. Possible reasons:
       -  different users are in different time zones
       -  there is no pattern in the data at all
 
 
 **Let's validate 1 (the test set is a continuation of the train set) by plotting train and test together on the same time axis.**


In [None]:
# Draw the train and test time histograms on one shared axis to check whether
# the test period follows the train period.  Each dataset is normalised by its
# own total, so bar heights are fractions within that dataset.
plt.figure(2, figsize=(12,3))
plt.bar(binsc1, counts1/(counts1.sum()*1.0), width=np.diff(bins1)[0], color=current_palette[0], label="Train")
plt.bar(binsc2, counts2/(counts2.sum()*1.0), width=np.diff(bins2)[0], color=current_palette[1], label="Test")
plt.grid(True)
plt.xlabel("Time")
plt.ylabel("Fraction")
# The figure shows both datasets, so the title must not say "Test" only.
plt.title("Train vs Test")
plt.legend()  # one legend entry per dataset
plt.show()

**Let's validate 2 (whether different users are in different time zones) by checking the frequencies of check-in times aggregated per place.**


In [None]:
def get_n_place_time_dict(df,n):
    """Map each of the ``n`` most frequent places in ``df`` to its time values.

    Args:
        df: frame with ``place_id`` and ``time`` columns.
        n: number of most frequent places to keep.

    Returns:
        dict ``{place_id: 1-D numpy array of that place's time values}``,
        ordered from the most to the least frequent place.  Places with a
        single row yield a length-1 array (never a 0-d scalar array).
    """
    # Rank places by row count in the frame that was passed in (not in a
    # global frame) and keep the n busiest; value_counts sorts descending.
    top_places = df['place_id'].value_counts().index[:n].tolist()

    # Filter once and group, instead of scanning the whole frame per place.
    top_rows = df[df['place_id'].isin(top_places)]
    times_by_place = top_rows.groupby('place_id')['time']

    # Series.to_numpy() replaces DataFrame.as_matrix(), which was removed in
    # pandas 1.0, and always returns a 1-D array.
    return {place_id: times_by_place.get_group(place_id).to_numpy()
            for place_id in top_places}

def get_all_autocorrs(place_time_dict,n):
    """Build, per place, a histogram of differences between pairs of its timestamps.

    For every place, ``n_events**2`` pairs of timestamps are drawn at random
    (with replacement) and the pairwise differences are histogrammed; the
    result serves as an estimate of the autocorrelation of that place's
    check-in times.  The sampling uses numpy's global random state, so the
    output is only reproducible if the caller seeds it.

    Args:
        place_time_dict: dict ``{place_id: array of time values}``.
        n: number of columns in the result; at most the first ``n`` places of
            the dict are used.

    Returns:
        Array of shape ``(50000, n)``; column ``i`` holds the histogram for the
        i-th place of the dict.  Columns without a place stay zero.
    """
    # Histogram settings: 200000 time units over 50000 bins, i.e. one bin per
    # 4.0 time units.  Differences outside the range are dropped by np.histogram.
    hist_range = (-100000.0, 100000.0)
    n_bins = 50000
    all_autocorrs = np.zeros((n_bins, n))

    for column, times in enumerate(place_time_dict.values()):
        if column >= n:
            # The output only has room for n places.
            break

        # A 0-d integer array would be read by np.random.choice as
        # "sample from arange(value)", so force a 1-D array of timestamps.
        times = np.ravel(times)
        n_events = times.size
        if n_events == 0:
            continue

        # Still random sampling, but n_events**2 draws give good coverage of
        # the possible pairs.
        n_samples = n_events * n_events
        first = np.random.choice(times, size=n_samples, replace=True)
        second = np.random.choice(times, size=n_samples, replace=True)
        hist_vals, _ = np.histogram(first - second, bins=n_bins, range=hist_range)
        all_autocorrs[:, column] = hist_vals

    return all_autocorrs

def plot_fft_of_autocorrelation(f, psd,magnitude):
    """Plot the log power spectrum against period, at several zoom levels.

    Args:
        f: 1-D array of frequencies, assumed to be in cycles per histogram
            bin.  It is not modified.
        psd: power spectral density values matching ``f`` along the first axis.
        magnitude: number of stacked subplots.  Subplot ``i`` shows the same
            curves with the x axis limited to ``[0, 2500 + 97500*i]``.
    """
    # Work on a float copy of the frequencies so the caller's array is left
    # untouched.  There is one bin per 4.0 time units, so a frequency f in
    # cycles per bin corresponds to a period of 4.0 / f time units.  The zero
    # frequency maps to an infinite period without dividing by zero.
    freqs = np.asarray(f, dtype=float)
    period = np.divide(4.0, freqs, out=np.full(freqs.shape, np.inf), where=freqs != 0)
    log_psd = np.log(psd)

    # squeeze=False keeps a 2-D array of axes even when magnitude == 1.
    fig, axs = plt.subplots(magnitude, 1, squeeze=False)

    for i, ax in enumerate(axs[:, 0]):
        x_max = 2500+97500*i
        ax.plot(period, log_psd)
        ax.set_title('Log FFT of autocorrelations for each place')
        ax.set_xlabel('time units')
        ax.set_xlim([0, x_max])
        ax.set_xticks(np.arange(0, x_max, 100+3900*i))
        ax.grid(True)

    fig.tight_layout()
    plt.show()

def investigate_fft_of_autocorrelation(df,n_most=100,magnitude=2):
    """Plot the spectrum of the time autocorrelation of the busiest places in ``df``.

    Args:
        df: frame to analyse; the helpers group it by ``place_id`` and read
            its ``time`` column.
        n_most: number of most frequent places to include.
        magnitude: number of zoom levels (subplots) to draw.
    """
    # Use the frame that was passed in, so that analysing a different frame
    # does not silently repeat the analysis of the training data.
    place_time_dict = get_n_place_time_dict(df,n_most)
    all_autocorrs = get_all_autocorrs(place_time_dict,n_most)

    # Welch power spectral density along the histogram-bin axis, one spectrum
    # per place (column).
    f, psd = scipy.signal.welch(all_autocorrs, nperseg=25000, noverlap=20000, return_onesided=True, axis=0)
    plot_fft_of_autocorrelation(f, psd, magnitude)

# Run the analysis on the training set with the default settings (100 places, 2 zoom levels).
investigate_fft_of_autocorrelation(df_train)


**Insights:**
 
    1.  It is easy to see that there is a correlation once we group by place_id, which probably indicates:
        - different time zones
        - places that are checked into at different times (schools mostly in the morning, pubs mostly at night)
 
    2. We can infer the time unit from the training set: there are peaks at 1440, 1700 and 10000.
       This result confirms that the time units are minutes, because:
       -  The peak at 1440 matches the number of minutes in a day (1,440)
       -  The peak at 10000 is near the number of minutes in a week (10,080)
       -  NOTE: I could not figure out why there is a peak at 1700

**Let's validate that the same time unit exists in the test set**

In [None]:
# Intended to repeat the analysis on the test set.
# NOTE(review): the helpers group by `place_id`, so this only analyses the test
# set if df_test has that column and the helpers read the frame passed here
# rather than a global one - confirm both before trusting this plot.
investigate_fft_of_autocorrelation(df_test)

 **Insights:**
 
    1.  It is easy to see that the same time unit (minutes) is used in the test set as well

### X Y Columns

In [None]:
def plot_scatter_x_y(df,group_by,n_top=10):
    """Scatter the x/y positions of the rows of the most frequent groups.

    Args:
        df: frame with ``x`` and ``y`` columns and a ``group_by`` column.
        group_by: name of the column whose most frequent values are plotted.
        n_top: number of most frequent values to plot, one colour each.
    """
    # Take the values straight from the value_counts index: the column that
    # reset_index() produces is named 'index' only on pandas < 2.0.
    top_values = df[group_by].value_counts().index[:n_top]
    colors = cm.rainbow(np.linspace(0, 1, len(top_values)))

    plt.figure(figsize=(10,10))
    for value, c in zip(top_values, colors):
        # Select the rows of this group once and reuse them for both axes.
        rows = df[df[group_by] == value]
        plt.scatter(rows['x'], rows['y'], color=c)

    plt.grid(True)
    plt.xlim(-0.1,10)
    plt.ylim(-0.1,10)
    plt.show()

# Scatter the check-in positions of the most frequent places in the training set.
plot_scatter_x_y(df_train,'place_id')

 **Insights:**
 
    1.  It is easy to see that there is a huge variation along the x axis and a small variation along the y axis

### Accuracy Column

In [None]:
# Square figure for the weekly accuracy plots drawn below.
plt.figure(0, figsize=(12,12))

def plot_accuracy_by_week(df,color,label,subplot,method):
    """Plot a weekly statistic of the ``accuracy`` column in one subplot.

    Args:
        df: frame with ``time`` and ``accuracy`` columns.  A ``week`` column
            is added to (or overwritten in) this frame as a side effect.
        color: line colour.
        label: legend label of the line.
        subplot: subplot specifier passed to ``plt.subplot`` (e.g. ``311``).
        method: which statistic to plot, ``"mean"`` or ``"std"``.

    Raises:
        KeyError: if ``method`` is neither ``"mean"`` nor ``"std"``.
    """
    # 60*24*7 is the number of minutes in a week, so this assumes `time` is
    # measured in minutes.  The column is written onto the caller's frame;
    # that side effect is kept in case later cells rely on it.
    df["week"] = np.ceil((df["time"]/(60*24*7)))

    # String aggregation names select pandas' own mean/std.  Passing the numpy
    # callables is deprecated in pandas 2.x; the strings keep the same result.
    df_wkaccuracy = df.groupby("week")["accuracy"].agg(["mean", "std"]).reset_index()

    weeks = df_wkaccuracy["week"]
    values = df_wkaccuracy[method]
    plt.subplot(subplot)
    plt.plot(weeks, values, c=color, lw=3, label=label)
    plt.legend(loc=2)

# Weekly standard deviation of accuracy: train in row 1 and test in row 2 of a 3x1 grid.
plot_accuracy_by_week(df_train,current_palette[0],'Train',311,'std')
plot_accuracy_by_week(df_test,current_palette[1],'Test',312,'std')
plt.tight_layout()
plt.show()

# Weekly mean accuracy: train and test side by side in the second row of a 3x2 grid.
plot_accuracy_by_week(df_train,current_palette[0],'Train',323,'mean')
plot_accuracy_by_week(df_test,current_palette[1],'Test',324,'mean')
plt.tight_layout()
plt.show()

In [None]:
 **Insights:**
 
    1. It seems that accuracy increases dramatically with time in the train set
    2. It seems that accuracy decreases slowly with time in the test set