In [1]:
import pandas as pd

### Two core data structures of pandas
* Series
* DataFrame

In [2]:
# Build a small integer Series whose rows are labelled 'a'..'d' instead of 0..3.
s1 = pd.Series(data=[1, 2, 3, 4], index=list('abcd'))

In [3]:
# Display the Series: index labels on the left, values on the right, dtype on the last line.
s1

a    1
b    2
c    3
d    4
dtype: int64

In [4]:
# Second Series sharing the same 'a'..'d' labels, so it can later be lined up with s1.
s2 = pd.Series(data=[11, 12, 13, 14], index=list('abcd'))

In [5]:
# Display the second Series to confirm its labels and values.
s2

a    11
b    12
c    13
d    14
dtype: int64

In [7]:
# Assemble a DataFrame from the two Series: each dict key becomes a column name,
# and the Series' index labels become the row labels.
df = pd.DataFrame(data={'C1': s1, 'C2': s2})

In [9]:
# NOTE: relative path -- this cell only succeeds when a
# 'data-science-complete-tutorial/Data' folder exists under the current working
# directory. hr_df is assigned again by a later cell that reads
# HR_comma_sep.csv.txt from a GitHub URL.
hr_df = pd.read_csv('data-science-complete-tutorial/Data/HR_comma_sep.csv.txt')

In [17]:
# Load the HR dataset directly from the tutorial's GitHub repository (needs network access).
hr_df = pd.read_csv(
    'https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/HR_comma_sep.csv.txt'
)

In [11]:
# NOTE: relative path -- requires 'data-science-complete-tutorial/Data/sales_info.xlsx'
# to exist under the current working directory.
# NOTE(review): sales_df is not referenced again in the visible part of this notebook -- confirm it is still needed.
sales_df = pd.read_excel('data-science-complete-tutorial/Data/sales_info.xlsx')

### Descriptive Analysis

In [13]:
# Overview of the frame: number of rows, per-column non-null counts and dtypes, and memory usage.
hr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


### Extracting subset of columns based on type

In [15]:
# Keep only the int64 columns and preview the first five rows.
hr_df.select_dtypes(include='int64').head()

Unnamed: 0,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
0,2,157,3,0,1,0
1,5,262,6,0,1,0
2,7,272,4,0,1,0
3,5,223,5,0,1,0
4,2,159,3,0,1,0


In [16]:
# Drop the int64 columns, leaving every other dtype, and preview the first five rows.
hr_df.select_dtypes(exclude='int64').head()

Unnamed: 0,satisfaction_level,last_evaluation,sales,salary
0,0.38,0.53,sales,low
1,0.8,0.86,sales,medium
2,0.11,0.88,sales,medium
3,0.72,0.87,sales,low
4,0.37,0.52,sales,low


In [18]:
# Keep only the object-dtype (here: text) columns and preview the first five rows.
hr_df.select_dtypes(include='object').head()

Unnamed: 0,sales,salary
0,sales,low
1,sales,medium
2,sales,medium
3,sales,low
4,sales,low


### Select subset of columns

In [20]:
# Attribute-style access to a single column yields a Series.
type(hr_df.salary)

pandas.core.series.Series

In [23]:
# Single-bracket access with one column name also yields a Series (same column as hr_df.salary).
type(hr_df['salary'])

pandas.core.series.Series

In [21]:
# Indexing with a list of column names (note the double brackets) yields a DataFrame.
type(hr_df[['salary','left']])

pandas.core.frame.DataFrame

In [22]:
# Preview the first rows of a two-column subset: all rows, just 'salary' and 'left'.
hr_df.loc[:, ['salary', 'left']].head()

Unnamed: 0,salary,left
0,low,1
1,medium,1
2,medium,1
3,low,1
4,low,1


In [24]:
# Even a one-element list of column names yields a DataFrame, not a Series.
type(hr_df[['salary']])

pandas.core.frame.DataFrame

### Describing Data
* By default, `describe()` summarizes only the numerical columns

In [26]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
hr_df.describe()

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years
count,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0,14999.0
mean,0.612834,0.716102,3.803054,201.050337,3.498233,0.14461,0.238083,0.021268
std,0.248631,0.171169,1.232592,49.943099,1.460136,0.351719,0.425924,0.144281
min,0.09,0.36,2.0,96.0,2.0,0.0,0.0,0.0
25%,0.44,0.56,3.0,156.0,3.0,0.0,0.0,0.0
50%,0.64,0.72,4.0,200.0,3.0,0.0,0.0,0.0
75%,0.82,0.87,5.0,245.0,4.0,0.0,0.0,0.0
max,1.0,1.0,7.0,310.0,10.0,1.0,1.0,1.0


In [28]:
# Median satisfaction score across all employees (matches the 50% row of describe()).
hr_df['satisfaction_level'].median()

0.64

In [29]:
# Distinct values stored in the 'left' column, in order of first appearance.
hr_df['left'].unique()

array([1, 0])

In [30]:
# Distinct values stored in the 'Work_accident' column, in order of first appearance.
hr_df['Work_accident'].unique()

array([0, 1])

In [31]:
# Distinct values stored in the 'sales' column, in order of first appearance.
hr_df['sales'].unique()

array(['sales', 'accounting', 'hr', 'technical', 'support', 'management',
       'IT', 'product_mng', 'marketing', 'RandD'], dtype=object)

In [33]:
# The 'sales' column holds department names (sales, accounting, hr, ...), so give
# it an accurate label. Reassigning the result instead of using inplace=True keeps
# the operation explicit and chainable; re-running the cell is harmless because
# rename() ignores labels that are no longer present.
hr_df = hr_df.rename(columns={'sales': 'department'})

In [34]:
# Re-inspect the schema to confirm the column formerly named 'sales' now appears as 'department'.
hr_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
department               14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.1+ MB


* Distribution of people across different departments

In [35]:
# Head-count per department, largest group first.
hr_df['department'].value_counts()

sales          4140
technical      2720
support        2229
IT             1227
product_mng     902
marketing       858
RandD           787
accounting      767
hr              739
management      630
Name: department, dtype: int64

In [36]:
# Head-count per salary band, largest group first.
hr_df['salary'].value_counts()

low       7316
medium    6446
high      1237
Name: salary, dtype: int64

### Saving changed data to file system

In [39]:
# Persist the frame (with the renamed column) to the user's home directory.
# index=False: the index is just the default 0..N-1 RangeIndex; writing it would
# add an unnamed first column that comes back as 'Unnamed: 0' when the file is re-read.
hr_df.to_csv('~/Corrected_Data.csv', index=False)

### Identifying Missing Values

In [43]:
# Load the Titanic training set from GitHub (needs network access), using the
# PassengerId column as the row index instead of a default 0..N-1 index.
titanic_df = pd.read_csv(
    'https://raw.githubusercontent.com/edyoda/data-science-complete-tutorial/master/Data/titanic-train.csv.txt',
    index_col='PassengerId',
)

In [45]:
# Preview the first rows; PassengerId appears as the index rather than as a regular column.
titanic_df.head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [46]:
# Compare each column's non-null count with the total number of entries to spot
# missing data (in the recorded output Age, Cabin and Embarked fall short of 891).
titanic_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 891 entries, 1 to 891
Data columns (total 11 columns):
Survived    891 non-null int64
Pclass      891 non-null int64
Name        891 non-null object
Sex         891 non-null object
Age         714 non-null float64
SibSp       891 non-null int64
Parch       891 non-null int64
Ticket      891 non-null object
Fare        891 non-null float64
Cabin       204 non-null object
Embarked    889 non-null object
dtypes: float64(2), int64(4), object(5)
memory usage: 83.5+ KB


In [53]:
# Boolean-mask selection: keep only the passengers whose Age is missing, then preview them.
titanic_df.loc[titanic_df['Age'].isnull()].head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
18,1,2,"Williams, Mr. Charles Eugene",male,,0,0,244373,13.0,,S
20,1,3,"Masselmani, Mrs. Fatima",female,,0,0,2649,7.225,,C
27,0,3,"Emir, Mr. Farred Chehab",male,,0,0,2631,7.225,,C
29,1,3,"O'Dwyer, Miss. Ellen ""Nellie""",female,,0,0,330959,7.8792,,Q


In [54]:
# Complementary mask: keep only the passengers whose Age is recorded, then preview them.
titanic_df.loc[titanic_df['Age'].notnull()].head()

Unnamed: 0_level_0,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
PassengerId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S
