In [4]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## Load two datasets

In [5]:
pheno = pd.read_csv("ac_pheno.txt", sep="\t", na_values="NA")

In [6]:
pheno.describe()

Unnamed: 0,PIT,Length,Weight,Tank,Site
count,2862.0,2838.0,2845.0,2852.0,2852.0
mean,952706.2,463.836152,1648.48225,2.934432,1.461781
std,534319.1,39.908756,529.221099,1.155825,0.498625
min,915581.0,305.0,306.0,1.0,1.0
25%,916798.2,440.0,1286.0,2.0,1.0
50%,918000.5,465.0,1600.0,3.0,1.0
75%,919317.8,490.0,1950.0,4.0,2.0
max,9202514.0,590.0,3942.0,4.0,2.0


In [7]:
pheno.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2862 entries, 0 to 2861
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   PIT     2862 non-null   int64  
 1   Length  2838 non-null   float64
 2   Weight  2845 non-null   float64
 3   Tank    2852 non-null   float64
 4   Sex     2852 non-null   object 
 5   Site    2852 non-null   float64
dtypes: float64(4), int64(1), object(1)
memory usage: 134.3+ KB


In [8]:
pedigree = pd.read_csv("ac_ped.txt", sep="\t")

In [9]:
pedigree.head()

Unnamed: 0,Id,Sire,Dam,Year_Class,Selected_gen
0,478665,0,0,2013,7
1,478620,0,0,2013,7
2,478601,02F49B,01FD38,2013,7
3,478656,02F49B,01FD38,2013,7
4,478671,02F49B,01FD38,2013,7


In [10]:
pedigree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8362 entries, 0 to 8361
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            8362 non-null   int64 
 1   Sire          8362 non-null   object
 2   Dam           8362 non-null   object
 3   Year_Class    8362 non-null   int64 
 4   Selected_gen  8362 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 326.8+ KB


## Change data type

In [11]:
pheno['Tank'].astype('category')

0       1.0
1       1.0
2       1.0
3       4.0
4       4.0
       ... 
2857    2.0
2858    2.0
2859    2.0
2860    2.0
2861    2.0
Name: Tank, Length: 2862, dtype: category
Categories (4, float64): [1.0, 2.0, 3.0, 4.0]

In [12]:
pheno[['PIT','Tank','Sex','Site']] = pheno[['PIT','Tank','Sex','Site']].astype('category')

In [13]:
pheno.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2862 entries, 0 to 2861
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype   
---  ------  --------------  -----   
 0   PIT     2862 non-null   category
 1   Length  2838 non-null   float64 
 2   Weight  2845 non-null   float64 
 3   Tank    2852 non-null   category
 4   Sex     2852 non-null   category
 5   Site    2852 non-null   category
dtypes: category(4), float64(2)
memory usage: 145.6 KB


## Missing data

Will create first a toy dataframe

In [14]:
df = pd.DataFrame(np.random.randn(10,5))
df.iloc[2:3,2:5] = np.nan
df.iloc[3,:] = np.nan
df.iloc[:,4] = np.nan
df

Unnamed: 0,0,1,2,3,4
0,0.417317,0.316734,0.517525,1.298876,
1,2.544269,0.355959,0.895502,0.388734,
2,-0.914559,-0.309111,,,
3,,,,,
4,-0.711255,-0.040933,-0.798967,-0.050887,
5,0.368484,-0.356796,-1.585723,1.259869,
6,-1.888954,0.170213,-0.106694,-0.195518,
7,2.2169,-1.169001,1.174276,-0.222852,
8,-1.326401,-0.665227,1.164508,0.09125,
9,0.070737,-0.53786,1.3323,0.576472,


If you want to inspect whether a row or a column is full of missing data you could try the following.

In [32]:
for column in df.columns:
    if df[column].isnull().all():
        print(f"Column {column} is full of missing values")
    else:
        print(f"Column {column} has non missing data")

Column 0 has non missing data
Column 1 has non missing data
Column 2 has non missing data
Column 3 has non missing data
Column 4 is full of missing values


In [33]:
[print(f"Row {i} is full of missing data") if df.iloc[i,:].isnull().all() else print(f"Row {i} has data") 
 for i in range(0,df.shape[0])];

Row 0 has data
Row 1 has data
Row 2 has data
Row 3 is full of missing data
Row 4 has data
Row 5 has data
Row 6 has data
Row 7 has data
Row 8 has data
Row 9 has data


In [74]:
df.isnull()

Unnamed: 0,0,1,2,3,4
0,False,False,False,False,True
1,False,False,False,False,True
2,False,False,True,True,True
3,True,True,True,True,True
4,False,False,False,False,True
5,False,False,False,False,True
6,False,False,False,False,True
7,False,False,False,False,True
8,False,False,False,False,True
9,False,False,False,False,True


In [79]:
df_na = df.copy()
df_na.dropna()

Unnamed: 0,0,1,2,3,4


The previous will drop any row that contains a missing value. We can be more flexible

In [76]:
df_na.dropna(how='all')

Unnamed: 0,0,1,2,3,4
0,0.016668,0.760506,-1.725937,-0.630309,
1,-1.020664,1.134737,-0.636954,0.279499,
2,1.268727,1.167512,,,
4,-0.265961,0.342777,-0.2241,-0.304493,
5,-0.279759,2.422095,2.052635,0.464536,
6,1.918913,0.704934,-0.367343,1.654645,
7,0.479391,-0.138547,0.017585,-1.90365,
8,0.214193,0.401736,-0.220801,-0.484547,
9,0.739941,0.372546,0.409929,0.616987,


In [80]:
df_na.dropna(axis="columns", how='all')

Unnamed: 0,0,1,2,3
0,0.016668,0.760506,-1.725937,-0.630309
1,-1.020664,1.134737,-0.636954,0.279499
2,1.268727,1.167512,,
3,,,,
4,-0.265961,0.342777,-0.2241,-0.304493
5,-0.279759,2.422095,2.052635,0.464536
6,1.918913,0.704934,-0.367343,1.654645
7,0.479391,-0.138547,0.017585,-1.90365
8,0.214193,0.401736,-0.220801,-0.484547
9,0.739941,0.372546,0.409929,0.616987


In [81]:
df_na.dropna(thresh=3)

Unnamed: 0,0,1,2,3,4
0,0.016668,0.760506,-1.725937,-0.630309,
1,-1.020664,1.134737,-0.636954,0.279499,
4,-0.265961,0.342777,-0.2241,-0.304493,
5,-0.279759,2.422095,2.052635,0.464536,
6,1.918913,0.704934,-0.367343,1.654645,
7,0.479391,-0.138547,0.017585,-1.90365,
8,0.214193,0.401736,-0.220801,-0.484547,
9,0.739941,0.372546,0.409929,0.616987,


### Filling missing value

In [82]:
df_na.fillna(0)

Unnamed: 0,0,1,2,3,4
0,0.016668,0.760506,-1.725937,-0.630309,0.0
1,-1.020664,1.134737,-0.636954,0.279499,0.0
2,1.268727,1.167512,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0
4,-0.265961,0.342777,-0.2241,-0.304493,0.0
5,-0.279759,2.422095,2.052635,0.464536,0.0
6,1.918913,0.704934,-0.367343,1.654645,0.0
7,0.479391,-0.138547,0.017585,-1.90365,0.0
8,0.214193,0.401736,-0.220801,-0.484547,0.0
9,0.739941,0.372546,0.409929,0.616987,0.0


In [83]:
df_na.fillna(method='ffill')

Unnamed: 0,0,1,2,3,4
0,0.016668,0.760506,-1.725937,-0.630309,
1,-1.020664,1.134737,-0.636954,0.279499,
2,1.268727,1.167512,-0.636954,0.279499,
3,1.268727,1.167512,-0.636954,0.279499,
4,-0.265961,0.342777,-0.2241,-0.304493,
5,-0.279759,2.422095,2.052635,0.464536,
6,1.918913,0.704934,-0.367343,1.654645,
7,0.479391,-0.138547,0.017585,-1.90365,
8,0.214193,0.401736,-0.220801,-0.484547,
9,0.739941,0.372546,0.409929,0.616987,


In [84]:
pheno['Length'].isnull()

0       False
1       False
2       False
3       False
4       False
        ...  
2857    False
2858    False
2859    False
2860    False
2861    False
Name: Length, Length: 2862, dtype: bool

In [85]:
sum(pheno['Length'].isnull())

24

In [86]:
pheno['Length'].isnull().any()

True

In [87]:
pheno['Length'].notnull()

0       True
1       True
2       True
3       True
4       True
        ... 
2857    True
2858    True
2859    True
2860    True
2861    True
Name: Length, Length: 2862, dtype: bool

In [88]:
sum(pheno['Length'].notnull())

2838

In [89]:
pheno.isnull()

Unnamed: 0,PIT,Length,Weight,Tank,Sex,Site
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
2857,False,False,False,False,False,False
2858,False,False,False,False,False,False
2859,False,False,False,False,False,False
2860,False,False,False,False,False,False


In [90]:
pheno_clean = pheno.copy()

In [91]:
pheno_clean.dropna(inplace=True)
pheno_clean.isna().any()

PIT       False
Length    False
Weight    False
Tank      False
Sex       False
Site      False
dtype: bool

## Replace values

In [92]:
pedigree.head()

Unnamed: 0,Id,Sire,Dam,Year_Class,Selected_gen
0,478665,0,0,2013,7
1,478620,0,0,2013,7
2,478601,02F49B,01FD38,2013,7
3,478656,02F49B,01FD38,2013,7
4,478671,02F49B,01FD38,2013,7


In [93]:
pedigree['Sire'].replace(0,np.nan)

0            0
1            0
2       02F49B
3       02F49B
4       02F49B
         ...  
8357    597579
8358    597579
8359    597579
8360    597579
8361    597579
Name: Sire, Length: 8362, dtype: object

In [95]:
pedigree.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8362 entries, 0 to 8361
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Id            8362 non-null   int64 
 1   Sire          8362 non-null   object
 2   Dam           8362 non-null   object
 3   Year_Class    8362 non-null   int64 
 4   Selected_gen  8362 non-null   int64 
dtypes: int64(3), object(2)
memory usage: 326.8+ KB


In [96]:
pedigree['Sire'].replace('0',np.nan)

0          NaN
1          NaN
2       02F49B
3       02F49B
4       02F49B
         ...  
8357    597579
8358    597579
8359    597579
8360    597579
8361    597579
Name: Sire, Length: 8362, dtype: object

In [31]:
pedigree[['Sire','Dam']].replace('0',np.nan)

Unnamed: 0,Sire,Dam
0,,
1,,
2,02F49B,01FD38
3,02F49B,01FD38
4,02F49B,01FD38
...,...,...
8357,597579,900730
8358,597579,900730
8359,597579,900730
8360,597579,900730


In [98]:
pedigree.loc[pedigree['Sire']=='0','Sire'] = np.nan

In [99]:
pedigree.head()

Unnamed: 0,Id,Sire,Dam,Year_Class,Selected_gen
0,478665,,0,2013,7
1,478620,,0,2013,7
2,478601,02F49B,01FD38,2013,7
3,478656,02F49B,01FD38,2013,7
4,478671,02F49B,01FD38,2013,7


## Detect duplications


In [100]:
pedigree['Id'].count()

8362

In [101]:
pedigree['Id'].nunique()

8359

In [102]:
pedigree.nunique()

Id              8359
Sire              74
Dam              105
Year_Class         2
Selected_gen       2
dtype: int64

In [103]:
pedigree['Id'].duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
8357    False
8358    False
8359    False
8360    False
8361    False
Name: Id, Length: 8362, dtype: bool

In [104]:
pedigree['Id'].duplicated().any()

True

In [105]:
pedigree['Id'].drop_duplicates()

0       478665
1       478620
2       478601
3       478656
4       478671
         ...  
8357    919218
8358    919013
8359    919448
8360    919246
8361    919420
Name: Id, Length: 8359, dtype: int64

In [106]:
pedigree.drop_duplicates(inplace=True)

In [107]:
pedigree.drop_duplicates?

[0;31mSignature:[0m
[0mpedigree[0m[0;34m.[0m[0mdrop_duplicates[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0msubset[0m[0;34m:[0m [0;34m'Hashable | Sequence[Hashable] | None'[0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mkeep[0m[0;34m:[0m [0;34m'DropKeep'[0m [0;34m=[0m [0;34m'first'[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0minplace[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mignore_index[0m[0;34m:[0m [0;34m'bool'[0m [0;34m=[0m [0;32mFalse[0m[0;34m,[0m[0;34m[0m
[0;34m[0m[0;34m)[0m [0;34m->[0m [0;34m'DataFrame | None'[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return DataFrame with duplicate rows removed.

Considering certain columns is optional. Indexes, including time indexes
are ignored.

Parameters
----------
subset : column label or sequence of labels, optional
    Only consider certain column

In [108]:
pedigree['Id'].duplicated().any()

False

## Creating new variables

In [109]:
pheno.head()

Unnamed: 0,PIT,Length,Weight,Tank,Sex,Site
0,919540,465.0,1514.0,1.0,U,1.0
1,918025,455.0,1250.0,1.0,U,1.0
2,917803,405.0,937.0,1.0,U,1.0
3,918763,505.0,2667.0,4.0,M,2.0
4,917365,500.0,2204.0,4.0,U,2.0


We want to create a category showing the growth peformance of the animals. Let's say animals can be classified in terms of their weight to the following:

- below 500 as slow growers
- above 500 and below 1500 as medium growers 
- above 1500 fast growers

In [44]:
thresholds = [300,500, 1500, 4000]

In [45]:
group_names = ['slow growing','medium growing', 'fast growing']

In [110]:
pd.cut(pheno['Weight'], bins=thresholds, labels=group_names)

0         fast growing
1       medium growing
2       medium growing
3         fast growing
4         fast growing
             ...      
2857      fast growing
2858    medium growing
2859      fast growing
2860    medium growing
2861    medium growing
Name: Weight, Length: 2862, dtype: category
Categories (3, object): ['slow growing' < 'medium growing' < 'fast growing']

In [111]:
pheno['Growth_rate'] = pd.cut(pheno['Weight'], bins=thresholds, labels=group_names)
pheno.head()

Unnamed: 0,PIT,Length,Weight,Tank,Sex,Site,Growth_rate
0,919540,465.0,1514.0,1.0,U,1.0,fast growing
1,918025,455.0,1250.0,1.0,U,1.0,medium growing
2,917803,405.0,937.0,1.0,U,1.0,medium growing
3,918763,505.0,2667.0,4.0,M,2.0,fast growing
4,917365,500.0,2204.0,4.0,U,2.0,fast growing


In [112]:
pheno['Growth_rate'].value_counts()

Growth_rate
fast growing      1648
medium growing    1191
slow growing         6
Name: count, dtype: int64

In [113]:
pheno['Growth_rate'] = pd.cut(pheno['Weight'], bins=3, labels=group_names)
pheno.head()

Unnamed: 0,PIT,Length,Weight,Tank,Sex,Site,Growth_rate
0,919540,465.0,1514.0,1.0,U,1.0,slow growing
1,918025,455.0,1250.0,1.0,U,1.0,slow growing
2,917803,405.0,937.0,1.0,U,1.0,slow growing
3,918763,505.0,2667.0,4.0,M,2.0,medium growing
4,917365,500.0,2204.0,4.0,U,2.0,medium growing


In [114]:
pheno['Growth_rate'].value_counts()

Growth_rate
medium growing    1498
slow growing      1235
fast growing       112
Name: count, dtype: int64

In [115]:
pheno['Growth_rate'] = pd.qcut(pheno['Weight'], 3, labels=group_names)
pheno.head()

Unnamed: 0,PIT,Length,Weight,Tank,Sex,Site,Growth_rate
0,919540,465.0,1514.0,1.0,U,1.0,medium growing
1,918025,455.0,1250.0,1.0,U,1.0,slow growing
2,917803,405.0,937.0,1.0,U,1.0,slow growing
3,918763,505.0,2667.0,4.0,M,2.0,fast growing
4,917365,500.0,2204.0,4.0,U,2.0,fast growing


In [116]:
pheno['Growth_rate'].value_counts()

Growth_rate
slow growing      949
medium growing    949
fast growing      947
Name: count, dtype: int64

In case we have a column of categorical values where it makes more sense to merge categories the following could be of value.

In [117]:
pheno.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2862 entries, 0 to 2861
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PIT          2862 non-null   category
 1   Length       2838 non-null   float64 
 2   Weight       2845 non-null   float64 
 3   Tank         2852 non-null   category
 4   Sex          2852 non-null   category
 5   Site         2852 non-null   category
 6   Growth_rate  2845 non-null   category
dtypes: category(5), float64(2)
memory usage: 148.6 KB


In [119]:
new_cat = {"slow growing" : "discard", "medium growing" : "keep", "fast growing" : "keep"}

pheno["Manage"] = pheno["Growth_rate"].map(new_cat)


In [59]:
pheno

Unnamed: 0,PIT,Length,Weight,Tank,Sex,Site,Growth_rate,Manage
0,919540,465.0,1514.0,1.0,U,1.0,medium growing,keep
1,918025,455.0,1250.0,1.0,U,1.0,slow growing,discard
2,917803,405.0,937.0,1.0,U,1.0,slow growing,discard
3,918763,505.0,2667.0,4.0,M,2.0,fast growing,keep
4,917365,500.0,2204.0,4.0,U,2.0,fast growing,keep
...,...,...,...,...,...,...,...,...
2857,920227,475.0,1565.0,2.0,U,1.0,medium growing,keep
2858,915897,440.0,1300.0,2.0,U,1.0,slow growing,discard
2859,917341,450.0,1510.0,2.0,U,1.0,medium growing,keep
2860,915861,450.0,1464.0,2.0,U,1.0,medium growing,keep


In [120]:
pheno.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2862 entries, 0 to 2861
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PIT          2862 non-null   category
 1   Length       2838 non-null   float64 
 2   Weight       2845 non-null   float64 
 3   Tank         2852 non-null   category
 4   Sex          2852 non-null   category
 5   Site         2852 non-null   category
 6   Growth_rate  2845 non-null   category
 7   Manage       2845 non-null   object  
dtypes: category(5), float64(2), object(1)
memory usage: 170.9+ KB


In [121]:
pheno["Manage"] = pheno["Manage"].astype("category")
pheno.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2862 entries, 0 to 2861
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype   
---  ------       --------------  -----   
 0   PIT          2862 non-null   category
 1   Length       2838 non-null   float64 
 2   Weight       2845 non-null   float64 
 3   Tank         2852 non-null   category
 4   Sex          2852 non-null   category
 5   Site         2852 non-null   category
 6   Growth_rate  2845 non-null   category
 7   Manage       2845 non-null   category
dtypes: category(6), float64(2)
memory usage: 151.5 KB


## Exercises 

### Exercise 1

For this exercise we will work with the `pedigree.txt` file.

* Change the type of the first column to string.
* Check whether missing data exist using two different approaches.
* Remove any row that has a missing value.
* Remove rows only if there are two or more missing values.
* Fill the missing values using the next valid observation.
* Fill the missing values using the previous valid observation.
* Check for the existence of duplicated values in the first column. In case there are duplicates estimate how many. Remove the duplicates.
* A colleague tells you that also the records with `0` denote missing data. Make the appropriate changes so that it is clear that those values denote also missing data.
* After removing all missing values and duplicates create a barplot depicting the number of offspring of each sire.
* Same as the previous but this time depicting the size of each family.
* Create a new column that for offspring originating from the first 20 sires and dams has the value `early` and for the rest `late`. Check how many records are on each category.

### Exercise 2

For this exercise we will work with the `pheno.txt` file.

* Change the type of the first column to string. Missing data are coded as `0`. Check how many missing data exist for each column.
* Check for duplications in the first column. If any remove them.
* For the numeric columns fill the missing values with the column mean.
* Create a new column where the animals that have a `weight_final` above 200 have the value `fast_grower`, the animals with a value between 100 - 200 as `medium_grower` and the rest `slow grower`. Check how frequent is each of the above three categories and based on that create a barplot.
* Create a new column where the top 1/3 of the animals in terms of weight difference (weight_final - weight_initial) are marked as `A`, the next 1/3 as `B` and the rest as `C`.
