In [207]:
import numpy as np
import pandas as pd
import collections

### Q1. Numpy Challenge
How would you compute the row-wise counts of all possible values in an array, i.e. compute the counts of the unique values in each row?

### Solution after refactoring
Use the NumPy function `np.bincount`.

In [208]:
def count_elements(arr, min_value=1, max_value=10):
    """
    Description: Function for row-wise counting of unique values
    Input :
        arr       - 2-D array of non-negative integers
        min_value - smallest value to count (default 1, must be >= 0)
        max_value - largest value to count (default 10, must be >= min_value)
    Output : new array of shape (number of rows, max_value - min_value + 1);
             entry [i, k] is how many times the value (min_value + k) occurs in row i.
             Values outside [min_value, max_value] are ignored.
    Raises : ValueError if arr is not 2-D or the value range is invalid
    Note : the input array is left untouched; the counts go into a fresh array,
           so the input may have any number of columns.
    """
    arr = np.asarray(arr)
    if arr.ndim != 2:
        raise ValueError("arr must be a 2-D array")
    if min_value < 0 or max_value < min_value:
        raise ValueError("expected 0 <= min_value <= max_value")

    num_rows = arr.shape[0]
    num_values = max_value - min_value + 1
    # Pre-allocate the result so that an input with zero rows still yields a (0, num_values) array
    counts = np.zeros((num_rows, num_values), dtype=np.intp)
    for i in range(num_rows):
        # minlength guarantees a bin for every value up to max_value even when the
        # row does not contain it; the slice keeps only the requested value range
        counts[i, :] = np.bincount(arr[i, :], minlength=max_value + 1)[min_value:max_value + 1]
    return counts

# Demo: build a reproducible 6x10 sample of integers in 1..10 and count each row's values
np.random.seed(100)
arr = np.random.randint(low=1, high=11, size=(6, 10))
result = count_elements(arr)
result

array([[1, 0, 2, 1, 1, 1, 0, 2, 2, 0],
       [2, 1, 3, 0, 1, 0, 1, 0, 1, 1],
       [0, 3, 0, 2, 3, 1, 0, 1, 0, 0],
       [1, 0, 2, 1, 0, 1, 0, 2, 1, 2],
       [2, 2, 2, 0, 0, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 2, 0, 0, 2, 1]])

### Before refactoring

In [209]:
# Input: seeded so the same 6x10 matrix of integers in 1..10 is drawn on every run
np.random.seed(100)
arr = np.random.randint(low=1, high=11, size=(6, 10))
arr

array([[ 9,  9,  4,  8,  8,  1,  5,  3,  6,  3],
       [ 3,  3,  2,  1,  9,  5,  1, 10,  7,  3],
       [ 5,  2,  6,  4,  5,  5,  4,  8,  2,  2],
       [ 8,  8,  1,  3, 10, 10,  4,  3,  6,  9],
       [ 2,  1,  8,  7,  3,  1,  9,  3,  6,  2],
       [ 9,  2,  6,  5,  3,  9,  4,  6,  1, 10]])

In [210]:
# Solution 1 (Best). Use function in Numpy : np.bincount
# Solution 2 (slow). Use nested loops

In [211]:
# %%timeit
# Solution 1 (fast). Let np.bincount do the counting, one row at a time
np.random.seed(100)
arr = np.random.randint(1, 11, size=(6, 10))
num_rows = arr.shape[0]  # number of rows that get replaced by their counts
for row in arr:
    # Each `row` is a view into arr, so writing through it overwrites that row in place.
    # minlength=11 gives bins 0..10; the slice drops bin 0 and keeps the counts for 1..10.
    value_counts = np.bincount(row, minlength=11)
    row[:] = value_counts[1:11]
arr

array([[1, 0, 2, 1, 1, 1, 0, 2, 2, 0],
       [2, 1, 3, 0, 1, 0, 1, 0, 1, 1],
       [0, 3, 0, 2, 3, 1, 0, 1, 0, 0],
       [1, 0, 2, 1, 0, 1, 0, 2, 1, 2],
       [2, 2, 2, 0, 0, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 2, 0, 0, 2, 1]])

In [212]:
# %%timeit
# Solution 2 (slow). Count by hand with two nested Python loops
np.random.seed(100)
arr = np.random.randint(1, 11, size=(6, 10))
num_rows, num_cols = arr.shape  # rows and columns of the sample matrix

for row_idx in range(num_rows):
    tally = np.zeros(10)  # one counter per value 1..10
    for value in arr[row_idx]:
        tally[value - 1] += 1  # value v is counted at position v-1
    # The row is replaced only after it has been read completely
    arr[row_idx] = tally
arr

array([[1, 0, 2, 1, 1, 1, 0, 2, 2, 0],
       [2, 1, 3, 0, 1, 0, 1, 0, 1, 1],
       [0, 3, 0, 2, 3, 1, 0, 1, 0, 0],
       [1, 0, 2, 1, 0, 1, 0, 2, 1, 2],
       [2, 2, 2, 0, 0, 1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1, 2, 0, 0, 2, 1]])

### Q2. Pandas Challenge
In `df`, replace NaNs with 'missing' in the columns 'Manufacturer', 'Model' and 'Type', create an index as a combination of these three columns, and check whether the index is a primary key.

### Solution refactoring
The solution is short, but if the question needed a longer answer, I would refactor it into a class like the one below.

In [213]:
class Question_2():
    '''
    Description: Class for solving question 2
    ex)
    q2 = Question_2(path)
    result = q2.solve()
    is_key = q2.index_is_primary_key()
    '''
    def __init__(self, file_path):
        """
        Description: A constructor to pass path parameter to an object
        Input : file_path - anything pd.read_csv accepts (path, URL or file-like object)
        """
        self.file_path = file_path
        # Filled in by solve(); None until then
        self.df = None

    def _load_df(self):
        """
        Description: Load dataframe (only the columns at positions 0, 1, 2, 3 and 5)
        """
        self.df = pd.read_csv(self.file_path, usecols=[0,1,2,3,5])

    def _transform_df(self):
        """
         Description: Transform the loaded dataframe.
         1. Replace NaNs to "missing"
         2. Create index with values in 'Manufacturer', 'Model', 'Type' columns
        """
        # NOTE(review): this fills NaNs in every loaded column, not only the three
        # key columns named in the task - kept as is to preserve the existing output.
        filled = self.df.fillna("missing")
        # Build the key from this instance's own frame; the previous version read a
        # notebook-level variable (df1) that does not exist on a fresh kernel.
        combined_key = filled['Manufacturer']+"_"+filled['Model']+"_"+filled['Type']
        self.df = filled.set_index(combined_key)

    def solve(self):
        """
        Description: Load the file, transform it and return the resulting dataframe
        Output : dataframe indexed by "<Manufacturer>_<Model>_<Type>"
        """
        self._load_df()
        self._transform_df()
        return self.df

    def index_is_primary_key(self):
        """
        Description: Check if the combined index can serve as a primary key
        Output : True if every index label is unique, False otherwise
        Raises : RuntimeError if solve() has not been called yet
        """
        if self.df is None:
            raise RuntimeError("call solve() before checking the index")
        return bool(self.df.index.is_unique)
        
# Run the refactored solution against the hosted Cars93 sample and preview the first rows
path = 'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv'
Q2 = Question_2(file_path=path)
result = Q2.solve()
result.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
Acura_Integra_Small,Acura,Integra,Small,12.9,18.8
missing_Legend_Midsize,missing,Legend,Midsize,29.2,38.7
Audi_90_Compact,Audi,90,Compact,25.9,32.3
Audi_100_Midsize,Audi,100,Midsize,missing,44.6
BMW_535i_Midsize,BMW,535i,Midsize,missing,missing


### Before refactoring

In [214]:
# Input: read the Cars93 sample, keeping only the columns at positions 0, 1, 2, 3 and 5
df = pd.read_csv(
    'https://raw.githubusercontent.com/selva86/datasets/master/Cars93_miss.csv',
    usecols=[0, 1, 2, 3, 5],
)

In [215]:
# Only the last expression of a cell is rendered, so this cell shows the shape;
# the head() result on the first line is not displayed.
df.head()
df.shape

(93, 5)

In [216]:
# Solution 1-1. Use dataframe method 'fillna'

In [217]:
# Locate the missing values, then keep only the rows that contain at least one NaN
np.where(pd.isnull(df))
rows_with_nan = df.isna().any(axis=1)
df1 = df[rows_with_nan]
df1.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
1,,Legend,Midsize,29.2,38.7
3,Audi,100,Midsize,,44.6
4,BMW,535i,Midsize,,
6,Buick,LeSabre,Large,19.9,
19,,Concorde,Large,18.4,18.4


In [218]:
# Replace every NaN (in all columns) with the string "missing".
# This rebinds df1 to the complete filled frame.
df1 = df.fillna("missing")

In [219]:
# Check if there is missing value
# Printed first: (row, column) positions of the NaNs in the original frame df.
# Cell result: the same lookup on the filled frame df1.
print(np.where(pd.isnull(df)))
np.where(pd.isnull(df1))

(array([ 1,  3,  4,  4,  6, 19, 25, 29, 45, 47, 49, 59, 62, 72, 74, 82, 82,
       83, 86, 92]), array([0, 3, 3, 4, 4, 0, 3, 3, 2, 4, 0, 1, 4, 3, 2, 2, 4, 3, 3, 0]))


(array([], dtype=int64), array([], dtype=int64))

In [220]:
# Look for empty-string cells: for each column, print the row labels whose value is ''
column_list = df.columns
for column_name in column_list:
    is_empty = df1[column_name] == ''
    print(df1[is_empty].index)

Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')
Int64Index([], dtype='int64')


In [221]:
# Preview the first rows of the filled frame
df1.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
0,Acura,Integra,Small,12.9,18.8
1,missing,Legend,Midsize,29.2,38.7
2,Audi,90,Compact,25.9,32.3
3,Audi,100,Midsize,missing,44.6
4,BMW,535i,Midsize,missing,missing


In [222]:
# Solution 1-2. Use dataframe method 'set_index' with a key joined from three columns
column_list = df1.columns[0:3].tolist()  # first three column names (not used below)
combined_key = df1['Manufacturer'] + "_" + df1['Model'] + "_" + df1['Type']
df2 = df1.set_index(combined_key)
df2.head()

Unnamed: 0,Manufacturer,Model,Type,Min.Price,Max.Price
Acura_Integra_Small,Acura,Integra,Small,12.9,18.8
missing_Legend_Midsize,missing,Legend,Midsize,29.2,38.7
Audi_90_Compact,Audi,90,Compact,25.9,32.3
Audi_100_Midsize,Audi,100,Midsize,missing,44.6
BMW_535i_Midsize,BMW,535i,Midsize,missing,missing
