In [1]:
# Import packages
import numpy as np
import pandas as pd

In [2]:
# Build a 5x4 integer grid holding the numbers 1..20 in row-major order
my_array = np.reshape(np.arange(1, 21), (5, 4))
my_array   # display the grid

array([[ 1,  2,  3,  4],
       [ 5,  6,  7,  8],
       [ 9, 10, 11, 12],
       [13, 14, 15, 16],
       [17, 18, 19, 20]])

In [3]:
# Row labels: one letter for each of the five rows
row_labels = list('ABCDE')
row_labels   # display the labels

['A', 'B', 'C', 'D', 'E']

In [4]:
# Column labels: alternating odd/even column names
col_labels = ['odd1', 'even1', 'odd2', 'even2']
col_labels   # display the labels

['odd1', 'even1', 'odd2', 'even2']

In [5]:
# Wrap the array in a data frame, naming the index and the columns explicitly
df_array = pd.DataFrame(data=my_array, index=row_labels, columns=col_labels)
df_array   # display the data frame

Unnamed: 0,odd1,even1,odd2,even2
A,1,2,3,4
B,5,6,7,8
C,9,10,11,12
D,13,14,15,16
E,17,18,19,20


In [8]:
# Append two derived columns holding the element-wise totals
df_array['Odd sum'] = df_array['odd1'].add(df_array['odd2'])      # odd1 + odd2
df_array['Even sum'] = df_array['even1'].add(df_array['even2'])   # even1 + even2
df_array   # display the extended data frame

Unnamed: 0,odd1,even1,odd2,even2,Odd sum,Even sum
A,1,2,3,4,4,6
B,5,6,7,8,12,14
C,9,10,11,12,20,22
D,13,14,15,16,28,30
E,17,18,19,20,36,38


In [9]:
# Grab the elements greater than five; every other cell is shown as NaN
df_array.where(df_array > 5)

Unnamed: 0,odd1,even1,odd2,even2,Odd sum,Even sum
A,,,,,,6
B,,6.0,7.0,8.0,12.0,14
C,9.0,10.0,11.0,12.0,20.0,22
D,13.0,14.0,15.0,16.0,28.0,30
E,17.0,18.0,19.0,20.0,36.0,38


In [10]:
# Drop row C by filtering on the first column (its 'odd1' value is 9)
df_array.loc[df_array['odd1'].ne(9)]

Unnamed: 0,odd1,even1,odd2,even2,Odd sum,Even sum
A,1,2,3,4,4,6
B,5,6,7,8,12,14
D,13,14,15,16,28,30
E,17,18,19,20,36,38


In [12]:
# Drop row C again, this time via the fourth column (its 'even2' value is 12)
df_array.loc[df_array['even2'].ne(12)]

Unnamed: 0,odd1,even1,odd2,even2,Odd sum,Even sum
A,1,2,3,4,4,6
B,5,6,7,8,12,14
D,13,14,15,16,28,30
E,17,18,19,20,36,38


In [13]:
# Drop rows D and E by keeping only the rows whose 'even2' is at most 12
df_array.loc[df_array['even2'].le(12)]

Unnamed: 0,odd1,even1,odd2,even2,Odd sum,Even sum
A,1,2,3,4,4,6
B,5,6,7,8,12,14
C,9,10,11,12,20,22


In [14]:
# Keep the rows where the first column exceeds 1 AND the last column is below 22
df_array.loc[df_array['odd1'].gt(1) & df_array['Even sum'].lt(22)]

Unnamed: 0,odd1,even1,odd2,even2,Odd sum,Even sum
B,5,6,7,8,12,14


In [15]:
# Column-name -> values mapping with some missing entries (NaN)
dic = dict(
    X=[1, 2, np.nan],
    Y=[4, np.nan, np.nan],
    Z=[7, 8, 9],
)
dic   # display the dictionary

{'X': [1, 2, nan], 'Y': [4, nan, nan], 'Z': [7, 8, 9]}

In [16]:
# Row labels: one letter for each of the three rows
row_labels = list('ABC')
row_labels   # display the labels

['A', 'B', 'C']

In [17]:
# Build a data frame from the dictionary, using the row labels as its index
df = pd.DataFrame(data=dic, index=row_labels)
df   # display the data frame

Unnamed: 0,X,Y,Z
A,1.0,4.0,7
B,2.0,,8
C,,,9


In [18]:
# Drop every row that contains at least one NaN
df.dropna(axis='index', how='any')   # these are the defaults, spelled out

Unnamed: 0,X,Y,Z
A,1.0,4.0,7


In [19]:
# Drop every column that contains at least one NaN
df.dropna(axis='columns')

Unnamed: 0,Z
A,7
B,8
C,9


In [20]:
# Keep only the rows that have at least 2 non-NaN values; drop the rest
df.dropna(axis='index', thresh=2)

Unnamed: 0,X,Y,Z
A,1.0,4.0,7
B,2.0,,8


In [22]:
# Replace every NaN with the placeholder string 'X'
df.fillna(value='X')

Unnamed: 0,X,Y,Z
A,1,4,7
B,2,X,8
C,X,X,9


In [28]:
# Pull column 'X' out of the data frame as a Series and keep it in a variable
xcol_var = df['X']
xcol_var   # display the Series (index labels, values, name and dtype)

A    1.0
B    2.0
C    NaN
Name: X, dtype: float64

In [29]:
# Average of the column; NaN entries are skipped
xcol_mean = xcol_var.mean(skipna=True)
xcol_mean   # display the mean

1.5

In [30]:
# Show the column with its NaN entries replaced by the mean
# (the result is only displayed; xcol_var is not reassigned)
xcol_var.fillna(xcol_mean)

A    1.0
B    2.0
C    1.5
Name: X, dtype: float64

In [24]:
# Same replacement for column 'X', written as one expression on the data frame
df['X'].fillna(df['X'].mean(skipna=True))

A    1.0
B    2.0
C    1.5
Name: X, dtype: float64

In [31]:
# Automate the entire process: fill each column's NaN entries with that column's mean.
# The filled column is assigned back to the frame rather than calling
# fillna(inplace=True) on df[col]: that form is chained in-place assignment,
# which pandas warns about and which leaves df unchanged under Copy-on-Write.
for col in 'X Y Z'.split():    # loop over the column labels
    df[col] = df[col].fillna(value=df[col].mean())
df # show

Unnamed: 0,X,Y,Z
A,1.0,4.0,7
B,2.0,4.0,8
C,1.5,4.0,9


In [32]:
# Sales records for company XYZ: sales person, product sold and sales figure
data = {
    'Sales Person': ['Same', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
    'Product': ['Hp', 'Hp', 'Apple', 'Apple', 'Dell', 'Dell'],
    'Sales': [200, 120, 340, 124, 243, 350],
}
data   # display the dictionary

{'Sales Person': ['Same', 'Charlie', 'Amy', 'Vanessa', 'Carl', 'Sarah'],
 'Product': ['Hp', 'Hp', 'Apple', 'Apple', 'Dell', 'Dell'],
 'Sales': [200, 120, 340, 124, 243, 350]}

In [33]:
# Row labels: the integers 1 through 6, one per sales record
row_labels = [*range(1, 7)]
row_labels   # display the labels

[1, 2, 3, 4, 5, 6]

In [34]:
# Build the sales data frame, indexed by the numeric row labels
df = pd.DataFrame(data=data, index=row_labels)
df   # display the data frame

Unnamed: 0,Sales Person,Product,Sales
1,Same,Hp,200
2,Charlie,Hp,120
3,Amy,Apple,340
4,Vanessa,Apple,124
5,Carl,Dell,243
6,Sarah,Dell,350


In [35]:
# Find sales information by product: total of the numeric columns per product.
# numeric_only=True keeps the text column 'Sales Person' out of the sum; without
# it, pandas >= 2.0 concatenates the names instead of dropping the column.
df.groupby('Product').sum(numeric_only=True)

Unnamed: 0_level_0,Sales
Product,Unnamed: 1_level_1
Apple,464
Dell,593
Hp,320


In [36]:
# Count, for each product, how many entries every other column holds
# (i.e. how many sales persons / sales records belong to that product)
df.groupby(by='Product').count()

Unnamed: 0_level_0,Sales Person,Sales
Product,Unnamed: 1_level_1,Unnamed: 2_level_1
Apple,2,2
Dell,2,2
Hp,2,2


In [37]:
# Summary statistics (count, mean, std, min, quartiles, max) of sales per product
df.groupby(by='Product').describe()

Unnamed: 0_level_0,Sales,Sales,Sales,Sales,Sales,Sales,Sales,Sales
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
Product,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
Apple,2.0,232.0,152.735065,124.0,178.0,232.0,286.0,340.0
Dell,2.0,296.5,75.660426,243.0,269.75,296.5,323.25,350.0
Hp,2.0,160.0,56.568542,120.0,140.0,160.0,180.0,200.0
