In [37]:
import pandas as pd

In [38]:
# Read the two source CSVs into separate dataframes; they are joined on
# 'breed' in a later cell, after the breed names have been normalised.
# The AKC breed-info file is decoded as ISO-8859-1 rather than the default encoding.
intel = pd.read_csv('./datasets/dog_intelligence.csv')
info = pd.read_csv('./datasets/akc_breed_info.csv', encoding="ISO-8859-1")

In [39]:
# Preview the first two rows of the intelligence data.
intel.head(2)

Unnamed: 0,Breed,Classification,obey,reps_lower,reps_upper
0,Border Collie,Brightest Dogs,95%,1,4
1,Poodle,Brightest Dogs,95%,1,4


In [40]:
# Preview the first two rows of the breed-info data.
info.head(2)

Unnamed: 0,Breed,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs
0,Akita,26,28,80,120
1,Anatolian Sheepdog,27,29,100,150


In [41]:
# Normalise the column labels of both frames to lowercase so later cells
# can refer to them consistently (e.g. 'Breed' -> 'breed').
intel.columns = intel.columns.map(str.lower)
info.columns = info.columns.map(str.lower)

In [42]:
# Lowercase the free-text columns so that breed names from the two files
# compare equal regardless of capitalisation.
intel[['breed', 'classification']] = intel[['breed', 'classification']].apply(
    lambda column: column.str.lower()
)
info['breed'] = info['breed'].str.lower()

In [43]:
# Summarise dtypes and non-null counts for the intelligence data.
intel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 141 entries, 0 to 140
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   breed           141 non-null    object
 1   classification  141 non-null    object
 2   obey            130 non-null    object
 3   reps_lower      141 non-null    int64 
 4   reps_upper      141 non-null    int64 
dtypes: int64(2), object(3)
memory usage: 5.6+ KB


In [44]:
# Summarise dtypes and non-null counts for the breed-info data.
info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154 entries, 0 to 153
Data columns (total 5 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   breed               154 non-null    object
 1   height_low_inches   154 non-null    object
 2   height_high_inches  154 non-null    object
 3   weight_low_lbs      154 non-null    object
 4   weight_high_lbs     154 non-null    object
dtypes: object(5)
memory usage: 6.1+ KB


In [45]:
# Strip leading/trailing whitespace from the breed names in both frames so
# that stray spaces do not prevent names from matching when the frames are
# compared and merged on 'breed'.
intel['breed'] = intel['breed'].str.strip()
info['breed'] = info['breed'].str.strip()

In [46]:
# function to check similarity of breed names
import difflib

def check_sim(dog_breed, candidates=None):
    """Find the candidate breed name most similar to ``dog_breed``.

    Similarity is measured with
    ``difflib.SequenceMatcher(None, dog_breed, candidate).ratio()``.

    Parameters
    ----------
    dog_breed : str
        Breed name to look up.
    candidates : iterable of str, optional
        Names to compare against. When omitted, the ``breed`` column of the
        notebook-level ``info`` dataframe is used, so the function can still
        be passed directly to ``Series.map`` with a single argument.

    Returns
    -------
    tuple
        ``(score, name)`` for the best-scoring candidate. When several
        candidates tie, the earliest one wins. If no candidate scores above
        zero (including when ``candidates`` is empty) the result is ``(0, "")``.
    """
    if candidates is None:
        candidates = info['breed']

    best_score, best_name = 0, ""
    for name in candidates:
        # Keep the argument order (dog_breed, name): the difflib docs warn
        # that ratio() may depend on the order of its two sequences.
        score = difflib.SequenceMatcher(None, dog_breed, name).ratio()
        if score > best_score:  # strict '>' keeps the first of any tied candidates
            best_score, best_name = score, name
    return best_score, best_name

# similarity test adapted from stack overflow user: duhaime
# Gene's code, adapted for dogs

In [47]:
# Score every breed name in intel against the names in info and tabulate the
# best match for each. Note that the 'breed' column of scores_df holds the
# closest name found in info, not the original intel name.
scores = intel['breed'].map(check_sim)
scores_df = pd.DataFrame(scores.tolist(), columns=['score', 'breed'])
scores_df.head(5)

Unnamed: 0,score,breed
0,1.0,border collie
1,0.75,toy poodle
2,1.0,german shepherd
3,1.0,golden retriever
4,1.0,doberman pinscher


In [48]:
# Drop the exact matches (score of 1) and list the remaining near-matches,
# closest first, for reference.
# The mismatched breed names were then fixed in the csv files.
scores_df.loc[scores_df['score'].ne(1)].sort_values('score', ascending=False)

Unnamed: 0,score,breed
101,0.888889,wirehair fox terrier
58,0.857143,australian terrier
82,0.816327,cavalier king charles spaniel
5,0.787879,belgian sheepdog
63,0.785714,irish terrier
86,0.780488,black russian terrier
109,0.769231,fox terrier
11,0.764706,giant schnauzer
1,0.75,toy poodle
59,0.705882,miniature poodle


In [49]:
# Join the intelligence and breed-info frames on the breed name.
# This is an inner join, so only breeds present in both frames are kept.
breeds = intel.merge(info, how='inner', on='breed')
breeds.head(n=2)

Unnamed: 0,breed,classification,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs
0,border collie,brightest dogs,95%,1,4,19,21,40,40
1,german shepherd,brightest dogs,95%,1,4,22,26,75,90


In [50]:
# Summarise dtypes and non-null counts for the merged frame.
breeds.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 126 entries, 0 to 125
Data columns (total 9 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   breed               126 non-null    object
 1   classification      126 non-null    object
 2   obey                116 non-null    object
 3   reps_lower          126 non-null    int64 
 4   reps_upper          126 non-null    int64 
 5   height_low_inches   126 non-null    object
 6   height_high_inches  126 non-null    object
 7   weight_low_lbs      126 non-null    object
 8   weight_high_lbs     126 non-null    object
dtypes: int64(2), object(7)
memory usage: 9.8+ KB


In [51]:
# Fill missing obedience percentages with the placeholder '10%'.
# Only 'obey' is targeted: a frame-wide fillna would also write the string
# '10%' into any other column that happened to contain nulls (for example the
# numeric reps_* columns). Assigning the result back, rather than using
# inplace=True, keeps the cell safe to re-run.
# NOTE(review): the rationale for choosing 10% is not recorded in this notebook.
breeds['obey'] = breeds['obey'].fillna('10%')

In [52]:
# Turn obedience strings such as '95%' into fractions such as 0.95:
# keep the text before the first '%', parse it as a float, and divide by 100.
breeds['obey'] = [float(pct.partition('%')[0]) / 100 for pct in breeds['obey']]

In [53]:
# Preview the merged frame after the 'obey' conversion.
breeds.head(2)

Unnamed: 0,breed,classification,obey,reps_lower,reps_upper,height_low_inches,height_high_inches,weight_low_lbs,weight_high_lbs
0,border collie,brightest dogs,0.95,1,4,19,21,40,40
1,german shepherd,brightest dogs,0.95,1,4,22,26,75,90


In [54]:
# Write the cleaned, merged data to CSV; index=False leaves the row index out of the file.
breeds.to_csv('./datasets/breeds_info_clean.csv', index=False)