# Video 1. Python Basics for R&D Scientists - Getting Started and Basic Data methods
---------

## Step 1. Install Python Jupyter using the Anaconda Distribution.  
**Note: this will take a while.**
## Step 2. Open Anaconda Navigator and select **Jupyter Lab**
![Screen Shot 2022-10-03 at 9.54.13 PM.png](attachment:a1784692-8ae0-4b86-af7c-e4757852e45c.png)


## Creating the Python Environment

In [4]:
# Set up the Python environment and load the test dataset
import pandas as pd 
import seaborn as sns

# Load seaborn's example 'iris' dataset; the rest of the notebook works on `df`.
# NOTE(review): load_dataset appears to fetch the data online on first use - confirm network access
df = sns.load_dataset('iris')

### Displaying data in dataframe

In [2]:
df # previews the data; long frames are shortened to the first and last rows

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [3]:
df.head() # shows the first 5 rows by default; pass a number, e.g. df.head(10), for more

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
df.tail() # shows the last 5 rows by default; pass a number, e.g. df.tail(10), for more

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica
149,5.9,3.0,5.1,1.8,virginica


In [5]:
# Returns 5 randomly chosen rows, so the rows shown differ from run to run
# (pass random_state=<int> if a repeatable sample is needed).
df.sample(5) # shows n-samples

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
99,5.7,2.8,4.1,1.3,versicolor
117,7.7,3.8,6.7,2.2,virginica
122,7.7,2.8,6.7,2.0,virginica
29,4.7,3.2,1.6,0.2,setosa
108,6.7,2.5,5.8,1.8,virginica


In [6]:
df.columns # the column labels of the DataFrame

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [7]:
df.index # the row labels; here a RangeIndex running from 0 up to (not including) 150

RangeIndex(start=0, stop=150, step=1)

### Summary statistics and metadata

In [8]:
df.describe() # returns basic descriptive statistics (count, mean, std, min, quartiles, max) for numeric columns by default

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [9]:
# Passing include selects which data types are summarised: here the float columns
# and the object (string) column. Statistics that do not apply to a column's
# type are left empty (NaN) in the result.
df.describe(include = [float, object])

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
count,150.0,150.0,150.0,150.0,150
unique,,,,,3
top,,,,,setosa
freq,,,,,50
mean,5.843333,3.057333,3.758,1.199333,
std,0.828066,0.435866,1.765298,0.762238,
min,4.3,2.0,1.0,0.1,
25%,5.1,2.8,1.6,0.3,
50%,5.8,3.0,4.35,1.3,
75%,6.4,3.3,5.1,1.8,


In [10]:
df.dtypes # shows the data type of each column

sepal_length    float64
sepal_width     float64
petal_length    float64
petal_width     float64
species          object
dtype: object

In [11]:
df.dtypes.value_counts() # counts how many columns have each data type

float64    4
object     1
dtype: int64

In [12]:
# Keep only the numeric columns (drops the string column 'species')
df.select_dtypes(include='number')

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


In [13]:
df.shape # (number of rows, number of columns)

(150, 5)

###  Dataframe mathematical methods

In [14]:
# Average the numeric columns only. 'species' holds strings, and calling
# df.mean() on a frame with a non-numeric column emits a FutureWarning in
# pandas 1.x and raises TypeError in pandas 2.x.
df.mean(numeric_only=True)

  df.mean()


sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [15]:
# Alternative: select the numeric columns explicitly, then take each column's mean
df.select_dtypes(include='number').mean()

sepal_length    5.843333
sepal_width     3.057333
petal_length    3.758000
petal_width     1.199333
dtype: float64

In [16]:
df.max() # returns the max value of each column
# for string/object columns the max is the last value in alphabetical order

sepal_length          7.9
sepal_width           4.4
petal_length          6.9
petal_width           2.5
species         virginica
dtype: object

In [17]:
df.min() # returns the min value of each column; strings are compared alphabetically

sepal_length       4.3
sepal_width        2.0
petal_length       1.0
petal_width        0.1
species         setosa
dtype: object

## Selecting data from dataframe

In [47]:
df['species'] # select the 'species' column by its name

0         setosa
1         setosa
2         setosa
3         setosa
4         setosa
         ...    
145    virginica
146    virginica
147    virginica
148    virginica
149    virginica
Name: species, Length: 150, dtype: object

In [48]:
df['species'].value_counts() # returns the number of rows for each unique value

setosa        50
versicolor    50
virginica     50
Name: species, dtype: int64

In [49]:
df['species'].nunique() # returns the number of distinct values in the column

3

* `.iloc` — integer-position-based selection
* `.loc` — label-based selection (index and column names)

In [20]:
# Rows at integer positions 0-4 (the stop position 5 is excluded) and all columns;
# this gives the same result as df.head()
df.iloc[:5, :] # row, col

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [21]:
df.iloc[:, :3] # every row, columns at positions 0-2 (the first 3 columns)

Unnamed: 0,sepal_length,sepal_width,petal_length
0,5.1,3.5,1.4
1,4.9,3.0,1.4
2,4.7,3.2,1.3
3,4.6,3.1,1.5
4,5.0,3.6,1.4
...,...,...,...
145,6.7,3.0,5.2
146,6.3,2.5,5.0
147,6.5,3.0,5.2
148,6.2,3.4,5.4


In [22]:
# Returns all rows for the columns from 'petal_length' through 'species';
# unlike .iloc, a .loc label slice includes its end label
df.loc[:,'petal_length':'species']

Unnamed: 0,petal_length,petal_width,species
0,1.4,0.2,setosa
1,1.4,0.2,setosa
2,1.3,0.2,setosa
3,1.5,0.2,setosa
4,1.4,0.2,setosa
...,...,...,...
145,5.2,2.3,virginica
146,5.0,1.9,virginica
147,5.2,2.0,virginica
148,5.4,2.3,virginica


In [23]:
# Petal measurements: all rows, columns 'petal_length' through 'petal_width' (end label included)
petal_data = df.loc[:, 'petal_length' : 'petal_width']
petal_data.head()

Unnamed: 0,petal_length,petal_width
0,1.4,0.2
1,1.4,0.2
2,1.3,0.2
3,1.5,0.2
4,1.4,0.2


In [24]:
# Sepal measurements: all rows, columns 'sepal_length' through 'sepal_width' (end label included)
sepal_data = df.loc[:, 'sepal_length':'sepal_width']
sepal_data.head()

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6


In [25]:
# Target labels: the 'species' column on its own
target_data = df['species']
target_data.head()

0    setosa
1    setosa
2    setosa
3    setosa
4    setosa
Name: species, dtype: object

In [31]:
# Recombine the sepal and petal measurements, matching rows on their shared index
msmt_data = sepal_data.merge(petal_data, left_index=True, right_index=True)
msmt_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [32]:
# Attach the species labels to the measurements, again matching rows by index
all_data = msmt_data.merge(target_data, left_index=True, right_index=True)
all_data.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [33]:
# DataFrame.equals checks shape, dtypes and every element in a single call.
# The built-in all() iterates over a DataFrame's column labels rather than its
# values, so all(df == all_data) would be True even if the values differed.
df.equals(all_data) # data frames are equal after merging data components

True