# Analysis

Technical Challenge for Data Science Candidates

Data checking and prototyping

In [None]:
import numpy as np
import pandas as pd
import math
import json

from os import path

import scipy.stats as st
import statsmodels as sm
import statsmodels.api as smi

from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler

import matplotlib
from cycler import cycler
import matplotlib.pyplot as plt

pd.__version__

## Development

There is a local module in a sub-dir providing "Utility"

In [2]:
# If you turn this feature on, you can display each result as it happens.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# this is the local Utility module

%load_ext autoreload

%reload_ext autoreload
%aimport wde
%autoreload 2

# unsure if aimport has to be called before or after usage.
from wde import Utility
%aimport wde

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [4]:
i0 = Utility.instance()

In [5]:
import unittest
import functools

class TestCase(unittest.TestCase):
    """
    To use test in prototyping
    """

    def test_simple_02(self):
        self.assertIsNotNone(i0)
        self.assertEqual(pd.__version__, pd.__version__)

    def test_simple_fail(self):
        self.assertNotEqual(pd.__version__, pd.__version__)

In [6]:
## To run the tests, use this
# Exit as true throws an exception.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

.F
FAIL: test_simple_fail (__main__.TestCase)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-5-ab97d2588b1d>", line 14, in test_simple_fail
    self.assertNotEqual(pd.__version__, pd.__version__)
AssertionError: '0.24.2' == '0.24.2'

----------------------------------------------------------------------
Ran 2 tests in 0.011s

FAILED (failures=1)


<unittest.main.TestProgram at 0x7fa7d53459e8>

In [7]:
## Some example data.

df0 = pd.read_csv("flights.csv", sep=",")

In [8]:
df0.info()
df0.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227496 entries, 0 to 227495
Data columns (total 14 columns):
date         227496 non-null object
hour         224591 non-null float64
minute       224591 non-null float64
dep          224591 non-null float64
arr          224430 non-null float64
dep_delay    224591 non-null float64
arr_delay    223874 non-null float64
carrier      227496 non-null object
flight       227496 non-null int64
dest         227496 non-null object
plane        226701 non-null object
cancelled    227496 non-null int64
time         223874 non-null float64
dist         227496 non-null int64
dtypes: float64(7), int64(3), object(4)
memory usage: 24.3+ MB


Unnamed: 0,date,hour,minute,dep,arr,dep_delay,arr_delay,carrier,flight,dest,plane,cancelled,time,dist
0,2011-01-01 12:00:00,14.0,0.0,1400.0,1500.0,0.0,-10.0,AA,428,DFW,N576AA,0,40.0,224
1,2011-01-02 12:00:00,14.0,1.0,1401.0,1501.0,1.0,-9.0,AA,428,DFW,N557AA,0,45.0,224
2,2011-01-03 12:00:00,13.0,52.0,1352.0,1502.0,-8.0,-8.0,AA,428,DFW,N541AA,0,48.0,224
3,2011-01-04 12:00:00,14.0,3.0,1403.0,1513.0,3.0,3.0,AA,428,DFW,N403AA,0,39.0,224
4,2011-01-05 12:00:00,14.0,5.0,1405.0,1507.0,5.0,-3.0,AA,428,DFW,N492AA,0,44.0,224


In [9]:
## Some methods in the Utility
df1 = i0.str2cat(df0)
df1.info()
df1.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227496 entries, 0 to 227495
Data columns (total 14 columns):
date         227496 non-null category
hour         224591 non-null float64
minute       224591 non-null float64
dep          224591 non-null float64
arr          224430 non-null float64
dep_delay    224591 non-null float64
arr_delay    223874 non-null float64
carrier      227496 non-null category
flight       227496 non-null int64
dest         227496 non-null category
plane        226701 non-null category
cancelled    227496 non-null int64
time         223874 non-null float64
dist         227496 non-null int64
dtypes: category(4), float64(7), int64(3)
memory usage: 18.9 MB


Unnamed: 0,date,hour,minute,dep,arr,dep_delay,arr_delay,carrier,flight,dest,plane,cancelled,time,dist
0,2011-01-01 12:00:00,14.0,0.0,1400.0,1500.0,0.0,-10.0,AA,428,DFW,N576AA,0,40.0,224
1,2011-01-02 12:00:00,14.0,1.0,1401.0,1501.0,1.0,-9.0,AA,428,DFW,N557AA,0,45.0,224
2,2011-01-03 12:00:00,13.0,52.0,1352.0,1502.0,-8.0,-8.0,AA,428,DFW,N541AA,0,48.0,224
3,2011-01-04 12:00:00,14.0,3.0,1403.0,1513.0,3.0,3.0,AA,428,DFW,N403AA,0,39.0,224
4,2011-01-05 12:00:00,14.0,5.0,1405.0,1507.0,5.0,-3.0,AA,428,DFW,N492AA,0,44.0,224


In [10]:
## This encodes
# Just re-encoding, no shaping
df2 = i0.cat2code(df1)
df2.info()
df2.head()
df2.to_csv("coded.csv", index=False)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227496 entries, 0 to 227495
Data columns (total 14 columns):
date         227496 non-null int16
hour         224591 non-null float64
minute       224591 non-null float64
dep          224591 non-null float64
arr          224430 non-null float64
dep_delay    224591 non-null float64
arr_delay    223874 non-null float64
carrier      227496 non-null int8
flight       227496 non-null int64
dest         227496 non-null int8
plane        227496 non-null int16
cancelled    227496 non-null int64
time         223874 non-null float64
dist         227496 non-null int64
dtypes: float64(7), int16(2), int64(3), int8(2)
memory usage: 18.7 MB


Unnamed: 0,date,hour,minute,dep,arr,dep_delay,arr_delay,carrier,flight,dest,plane,cancelled,time,dist
0,0,14.0,0.0,1400.0,1500.0,0.0,-10.0,0,428,32,1761,0,40.0,224
1,1,14.0,1.0,1401.0,1501.0,1.0,-9.0,0,428,32,1701,0,45.0,224
2,2,13.0,52.0,1352.0,1502.0,-8.0,-8.0,0,428,32,1652,0,48.0,224
3,3,14.0,3.0,1403.0,1513.0,3.0,3.0,0,428,32,1088,0,39.0,224
4,4,14.0,5.0,1405.0,1507.0,5.0,-3.0,0,428,32,1377,0,44.0,224


In [11]:
## Many ways to check for null columns
# I use this like R

In [12]:
ds = i0.df2describe(df2)
# ds;
ds[ (ds['name'].str.startswith('date'))]

cts = ds[ (ds['q'].str.startswith('count'))]
cts['v'].value_counts()
# cts.value_counts

Unnamed: 0,name,q,v
0,date,count,227496.0
1,date,mean,181.628613
2,date,std,104.448661
3,date,min,0.0
4,date,25%,92.0
5,date,50%,182.0
6,date,75%,271.0
7,date,max,364.0


227496.0    7
224591.0    4
223874.0    2
224430.0    1
Name: v, dtype: int64

In [14]:
import unittest
import functools

class TestCase(unittest.TestCase):
    """
    Check some utilities for checking data loads.
    """

    def test_simple_02(self):
        cts = ds[ (ds['q'].str.startswith('count'))]
        i2 = len(set(cts['v'].value_counts().values))
        self.assertEqual(i2, 1, "classes of records with null")

    def test_simple_04(self):
        (mx0, cts) = i0.describe2null(ds)
        if any(cts.index != mx0):
            ps = 100 * (mx0 - cts.index)/cts.index
        print(ps)
        
    def test_simple_06(self):
        pass
        

    def test_simple_fail(self):
        self.assertNotEqual(pd.__version__, pd.__version__)

In [15]:
## To run the tests, use this
# Exit as true throws an exception.
if __name__ == '__main__':
    unittest.main(argv=['first-arg-is-ignored'], exit=False)

F..F

Float64Index([0.0, 1.2934623382058943, 1.6178743400305529, 1.3661275230584147], dtype='float64')



FAIL: test_simple_02 (__main__.TestCase)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-14-6d0bbe9a4a10>", line 12, in test_simple_02
    self.assertEqual(i2, 1, "classes of records with null")
AssertionError: 4 != 1 : classes of records with null

FAIL: test_simple_fail (__main__.TestCase)
----------------------------------------------------------------------
Traceback (most recent call last):
  File "<ipython-input-14-6d0bbe9a4a10>", line 25, in test_simple_fail
    self.assertNotEqual(pd.__version__, pd.__version__)
AssertionError: '0.24.2' == '0.24.2'

----------------------------------------------------------------------
Ran 4 tests in 0.023s

FAILED (failures=2)


<unittest.main.TestProgram at 0x7fa7d47e7dd8>

In [40]:
v0 = i0.describe2null(ds)
print(v0)

(227496.0, 227496.0    7
224591.0    4
223874.0    2
224430.0    1
Name: v, dtype: int64)


In [43]:
## Checking for Records with NaN/null

In [41]:
from wde import Utility
# cts['v'].value_counts()
(mx0, cts) = i0.describe2null(ds)

In [45]:
if any(cts.index != mx0):
    100 * (mx0 - cts.index)/cts.index

Float64Index([0.0, 1.2934623382058943, 1.6178743400305529, 1.3661275230584147], dtype='float64')

In [51]:
## a Big block of records - non-null

df2.head().apply(isna, axis=1)

AttributeError: 'DataFrame' object has no attribute 'map'

In [None]:
df0 = pd.read_csv("flights.csv", sep=",")

## Prototyping

In [39]:
prototype == 1

NameError: name 'prototype' is not defined

In [35]:
df2.shape[0]

227496

In [38]:
df3 = cts1.to_frame()
idx = df3[df3['v'] == mx0].index.values
idx[0]
idx1 = np.where(cts1.index.values == idx[0])
idx1[0][0]

NameError: name 'cts1' is not defined

In [None]:
df0 = pd.read_csv("flights.csv", sep=",")

In [None]:
list(df2.columns)
df2.info()

In [None]:
thresh0 = 0.7
thresh1 = thresh0 * (1 - thresh0)
nzv0 = i0.nzv(df2, thresh=thresh1)

In [None]:
ds[ds['name'].isin(nzv0)]

In [None]:
df3 = i0.code2scale(df2, scaler0=StandardScaler(with_std=False))
df3.head()
thresh0 = 0.7
thresh1 = thresh0 * (1 - thresh0)
nzv0 = i0.nzv(df3, thresh=thresh1)

ds = i0.df2describe(df3)
ds[ds['name'].isin(nzv0)]

In [None]:
# [ df3[x].describe() for x in df.columns ]
x0 = df3.age.describe()
x1 = df3.job.describe()
pd.concat([x0, x1])

In [None]:
ds = i0.df2describe(df3)
ds

In [None]:
df2.head()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

sns.set(style="white")

# Compute the correlation matrix
corr = df2.corr()

# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool)
mask[np.triu_indices_from(mask)] = True

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))

# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)

# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})

In [None]:
df2.corr()['y'][:]

In [None]:
sns.pairplot(df2)

In [None]:
from matplotlib.ticker import AutoMinorLocator

fig = plt.figure(figsize=(18,6))

ax1 = sns.violinplot('marital', "y", data=df2, hue='modelLine')

ax1.minorticks_on()
ax1.xaxis.set_minor_locator(AutoMinorLocator(2))

ax1.grid(which='minor', axis='x', linewidth=1)

In [16]:
s0 = r"{'abc': 123, 'cde': 'abc'}"

In [17]:
s0

"{'abc': 123, 'cde': 'abc'}"

In [18]:
import json

In [23]:
d0 = eval(s0)
type(d0)

dict

In [28]:
pd.DataFrame.from_dict(s0)

ValueError: DataFrame constructor not properly called!

In [35]:
x = {'a': 1, 'b': 2, 'c': None, 'd': None}
y = {'b': 10, 'c': 11}
x.update(y)

In [36]:
x

{'a': 1, 'b': 10, 'c': 11, 'd': None}

In [33]:
y

{'b': 10, 'c': 11}

In [None]:
# input()

In [5]:
import array as arr

a = arr.array('H', [1.1, 3.5, 4.5])
print(a)
type(a)

TypeError: integer argument expected, got float

In [6]:
A = arr.array('H', [60,80,40])
B = arr.array('H', [2,3,5])

In [50]:
z0 = zip(A,B)
M = 5
X = 2
Y = 200

In [54]:
class Lift:
    'Lift state machine'
    x = 0
    y = 0
    X = None
    Y = None
    M = None
    
    stop_ = 0
    
    load_ = list()
    
    def __init__(self, X, Y, M):
        self.X = X
        self.Y = Y
        self.M = M
        
    def load(self, p0):
        if self.x + 1 <= self.X and self.y + p0[0] <= self.Y:
            self.load_.append(p0[1])
            return True
        
        return False
   
    def unload(self):
        self.stop_ = self.stop_ + len(set(self.load_))
        self.load_ = list()
        self.x = 0
        self.y = 0

In [55]:
l0 =Lift(2, 100, 5)

In [57]:
try:
    p0 = next(z0)
    while True:
        print(p0)
        if l0.load(p0):
            l0.unload()
            p0 = next(z0)
    
except:
    l0.unload()
    
print(l0.stop_)

2


In [26]:
def solution(A, B, M, X, Y):
    """
    Give a queue of lift passengers find the number of stops needed.
    
    @param A an integer array
    @param B an integer array
    """
    
    
    
    pass



In [28]:
l0=[1,2,2]

In [30]:
len(set(l0))

2