Format the combined_species file
-----

Input file : combined_species_1.csv

In [1]:
import pandas as pd
import textacy
import re



In [2]:
# Input CSV for this notebook (see the "Input file" note above).
base_inp_file ='combined_species_1.csv'
# index_col=0: the first CSV column becomes the DataFrame index.
base_df = pd.read_csv(base_inp_file,index_col=0) 


In [3]:
def setup_locs_list(loc_list_file='country_list.txt'):
    """Load the lower-cased location names used to filter common names.

    Parameters
    ----------
    loc_list_file : str, optional
        Path of a text file holding one location name per line. Defaults to
        'country_list.txt', the path that used to be hard-coded.

    Returns
    -------
    list of str
        One entry per line of the file, in file order, with surrounding
        whitespace removed and converted to lower case. Blank lines are kept
        as empty strings, exactly as before.

    Raises
    ------
    OSError
        If the file cannot be opened.
    """
    # A single `with` block owns the handle. The earlier version also called
    # open() once outside the `with`, leaking a descriptor that was never closed.
    with open(loc_list_file, 'r') as f:
        return [line.strip().lower() for line in f]
    
# Module-level list of lower-cased location names; check_location() reads it.
locs = setup_locs_list()

def check_location(token):
    """Tell whether `token` is one of the known location names.

    The token is lower-cased and looked up in the module-level `locs`
    collection, so the match is case-insensitive on the token side.

    Returns True when the lower-cased token is present in `locs`, else False.
    """
    return token.lower() in locs
    
def clean_cn_aux(cn):
    """Split one common-name entry into individual names, dropping locations.

    An entry may carry a parenthesised, comma-separated list after the main
    name, e.g. ``"babul (india, gum arabic tree)"``. The parenthesised items
    come first in the result, followed by the text that precedes the ``(``.
    Text after the closing ``)`` is ignored, as before.

    Parameters
    ----------
    cn : str
        A single common-name entry (already split on ';' by the caller).

    Returns
    -------
    list of str
        The stripped names for which ``check_location`` is False. Empty
        fragments are left out.

    Notes
    -----
    Changes from the earlier implementation:

    * A missing ``)`` is handled explicitly by reading to the end of the
      string. Previously the check tested ``'('`` instead of ``')'``, so the
      intended fallback was unreachable and the whole entry was returned.
    * The closing ``)`` is searched for *after* the ``(``.
    * No bare ``except``: it used to hide every error and silently disable
      the parsing. That included the AttributeError raised when the installed
      textacy has no ``textacy.preprocess`` module.
    * Whitespace is collapsed with ``str.split``/``join`` instead of textacy;
      every whitespace run, newlines included, becomes one space.
    * Names are stripped before the location test, so `` india`` is now
      recognised as a location.
    """
    st = cn.find('(')
    if st == -1:
        # No parenthesised part: the entry is a single name.
        candidates = [cn.strip()]
    else:
        en = cn.find(')', st + 1)
        if en == -1:
            # Unbalanced "(": treat everything after it as the list.
            en = len(cn)
        inner = ' '.join(cn[st + 1:en].split())
        candidates = [part.strip() for part in inner.split(',')]
        candidates.append(cn[:st].strip())

    return [name for name in candidates if name and not check_location(name)]


# ------------------------------------ #
def clean_up_common_names(row):
    """Normalise the ';'-separated 'common_name' field of one row.

    The value is lower-cased and split on ';'. Double quotes and backticks
    are removed from each entry, which is then expanded by ``clean_cn_aux``.
    The resulting names are stripped and re-joined with ';'.

    Returns None when the field is not a plain ``str`` (e.g. NaN), otherwise
    the cleaned, ';'-joined string.
    """
    raw = row['common_name']
    if type(raw) is not str:
        return None

    cleaned = []
    for entry in raw.lower().split(';'):
        entry = entry.replace('"', '').replace('`', '')
        cleaned.extend(name.strip() for name in clean_cn_aux(entry))
    return ';'.join(cleaned)

# axis=1: clean_up_common_names receives each row; its result replaces 'common_name'.
base_df['common_name'] = base_df.apply(clean_up_common_names,axis=1)


In [5]:
# NOTE: this is not the last expression of the cell, so nothing is displayed here.
base_df.head(10)
# Add the 'is_tropical' column with no value set for any row.
base_df['is_tropical'] = None

def set_genus(row):
    """Return the genus: the first word of the row's 'sc_name'.

    The name is tokenised on any run of whitespace, so leading or repeated
    spaces no longer produce an empty-string genus.

    Returns None when 'sc_name' is not a string (e.g. NaN) or has no words.
    """
    scn = row['sc_name']
    if not isinstance(scn, str):
        return None
    tokens = scn.split()
    return tokens[0] if tokens else None

# Derive the 'genus' column row by row from 'sc_name'.
base_df['genus'] = base_df.apply(set_genus,axis=1)   

Every genus that contains at least one coniferous species is treated as entirely coniferous
----------

In [6]:
def find_con_genus(df) :
    """Collect the distinct genera that have at least one coniferous row.

    A row counts as coniferous when its 'is_coniferous' value equals 1.
    The result is a list of unique 'genus' values in no particular order.
    """
    coniferous_genera = {
        record['genus']
        for _, record in df.iterrows()
        if record['is_coniferous'] == 1
    }
    return list(coniferous_genera)
 

# Genera with at least one coniferous row; rectify_coniferous() reads this global.
con_genus_list = find_con_genus(base_df)

def rectify_coniferous(row):
    """Return the coniferous flag for a row, propagated across its genus.

    Gives 1 when the row's 'genus' appears in the module-level
    `con_genus_list`; otherwise the row's own 'is_coniferous' value is
    returned unchanged.
    """
    return 1 if row['genus'] in con_genus_list else row['is_coniferous']

# Overwrite 'is_coniferous' with the genus-propagated value, row by row.
base_df['is_coniferous'] = base_df.apply(rectify_coniferous,axis =1)


In [7]:
def set_species(row):
    """Return the species epithet: the second word of the row's 'sc_name'.

    The name is tokenised on any run of whitespace, so a doubled or leading
    space no longer yields an empty-string species.

    Returns None when 'sc_name' is not a string (e.g. NaN) or has fewer than
    two words.
    """
    scn = row['sc_name']
    if not isinstance(scn, str):
        return None
    tokens = scn.split()
    return tokens[1] if len(tokens) >= 2 else None

def set_subsp(row):
    """Return the third word of the row's 'sc_name', used as the 'sub_sp' value.

    The name is tokenised on any run of whitespace, so a doubled or leading
    space no longer shifts the species epithet into this position.

    Returns None when 'sc_name' is not a string (e.g. NaN) or has fewer than
    three words.
    """
    scn = row['sc_name']
    if not isinstance(scn, str):
        return None
    tokens = scn.split()
    return tokens[2] if len(tokens) > 2 else None

# Create both columns first; each is overwritten by the apply() calls below.
base_df['species'] = None
base_df['sub_sp'] = None
# Fill them row by row from 'sc_name'.
base_df['species'] = base_df.apply(set_species,axis = 1)
base_df['sub_sp'] = base_df.apply(set_subsp,axis = 1)

# Save the table; a later cell writes to this same file name again after reordering.
base_df.to_csv('combined_species_2.csv')

In [8]:
# Print the column names, then show the first ten rows as the cell output.
print (base_df.columns)
base_df.head(10)

Index(['entity_id', 'is_coniferous', 'sc_name', 'family', 'flag',
       'range_state', 'common_name', 'genus', 'is_tropical', 'species',
       'sub_sp'],
      dtype='object')


Unnamed: 0,entity_id,is_coniferous,sc_name,family,flag,range_state,common_name,genus,is_tropical,species,sub_sp
0,1,0.0,Acacia auriculiformis,Leguminosae,0.0,,ear-leaf acacia,Acacia,,auriculiformis,
1,2,0.0,Acacia crassicarpa,Leguminosae,1.0,,,Acacia,,crassicarpa,
2,3,0.0,Acacia decurrens,Leguminosae,,,,Acacia,,decurrens,
3,4,0.0,Acacia koa,Leguminosae,0.0,,koa;gray koa,Acacia,,koa,
4,5,0.0,Acacia mangium,Leguminosae,,,,Acacia,,mangium,
5,6,0.0,Acacia mearnsii,Leguminosae,,,,Acacia,,mearnsii,
6,7,0.0,Acacia melanoxylon,Leguminosae,,,,Acacia,,melanoxylon,
7,8,0.0,Acacia nilotica,Leguminosae,0.0,,sunt;qarad,Acacia,,nilotica,
8,9,0.0,Acacia pubescens,Leguminosae,,,,Acacia,,pubescens,
9,10,0.0,Acer campestre,Sapindaceae,0.0,,field maple,Acer,,campestre,


In [9]:
# Reorder the columns
base_df = base_df[['entity_id',
                    'sc_name',
                    'family',
                    'genus',
                    'species',
                    'sub_sp' ,
                    'common_name',
                    'range_state',
                    'is_tropical',
                    'is_coniferous',
                    'flag']]
# Order rows by taxonomy: genus first, then species, then sub_sp.
base_df = base_df.sort_values(by = ['genus','species','sub_sp'])

# Write the reordered, sorted table (same file name as the earlier save).
base_df.to_csv('combined_species_2.csv')
base_df.head(10)

Unnamed: 0,entity_id,sc_name,family,genus,species,sub_sp,common_name,range_state,is_tropical,is_coniferous,flag
24562,24563,Abarema abbottii,,Abarema,abbottii,,,,,,1.0
24560,24561,Abarema alexandri,,Abarema,alexandri,,,,,,0.0
18069,18070,Abarema bigemina,,Abarema,bigemina,,,,,,1.0
24552,24553,Abarema callejasii,,Abarema,callejasii,,,,,,1.0
24566,24567,Abarema centiflora,,Abarema,centiflora,,,,,,1.0
24550,24551,Abarema cochleata,,Abarema,cochleata,,,,,,1.0
24559,24560,Abarema cochliocarpos,,Abarema,cochliocarpos,,,,,,0.0
24547,24548,Abarema commutata,,Abarema,commutata,,,,,,0.0
24543,24544,Abarema curvicarpa,,Abarema,curvicarpa,,orelha de negro,,,,0.0
24557,24558,Abarema filamentosa,,Abarema,filamentosa,,,,,,1.0


In [2]:
import pandas as pd
import math
base_df = pd.read_csv('combined_species_2.csv',index_col=0)
print(base_df.columns)

# Start deduplication: collect every distinct (genus, species) pair.
# Rows whose genus or species is not a string are skipped. They cannot form a
# pair, and calling .strip() on them used to raise AttributeError. This covers
# NaN after the CSV round trip, e.g. a single-word scientific name.
scn_pairs = set()
for _, row in base_df.iterrows():
    gn, sp = row['genus'], row['species']
    if not isinstance(gn, str) or not isinstance(sp, str):
        continue
    scn_pairs.add((gn.strip(), sp.strip()))

# scn_list is a list of [genus, species] lists, ordered the way the joined
# "genus species" strings sort. The pairs are no longer joined and re-split,
# so an embedded space cannot shift the fields.
scn_list = [
    list(pair)
    for pair in sorted(scn_pairs, key=lambda p: (' '.join(p), p))
]

Index(['entity_id', 'sc_name', 'family', 'genus', 'species', 'sub_sp',
       'common_name', 'range_state', 'is_tropical', 'is_coniferous', 'flag'],
      dtype='object')


In [3]:
# Ad-hoc lookup of the rows for Abies procera; the result is only bound to `t`, not displayed.
t =base_df.loc[(base_df['genus'] == 'Abies') & (base_df['species'] == 'procera') ]



In [8]:
new_cols = [ 'family', 'genus', 'species', 'sub_sp','common_name', 'range_state', 'is_tropical', 'is_coniferous', 'flag']


def _first_string(values):
    """Return the first str in `values`, or None when there is none."""
    return next((v for v in values if isinstance(v, str)), None)


def _merge_text_values(values):
    """Join the distinct ';'-separated items found across `values`.

    Non-string entries (e.g. NaN) are ignored. Items keep their first-seen
    order so the joined string is reproducible between runs. Returns None
    when no item is found.
    """
    items = []
    for value in values:
        if isinstance(value, str):
            items.extend(value.split(';'))
    if not items:
        return None
    return ';'.join(dict.fromkeys(items))


def _max_number(values):
    """Return the largest non-NaN number in `values`, or None when there is none.

    Ints are accepted as well as floats, so a column read without missing
    values (integer dtype) is not silently discarded.
    """
    present = [
        v for v in values
        if isinstance(v, (int, float)) and not math.isnan(v)
    ]
    return max(present) if present else None


# One frame per (genus, species), built in a single pass rather than scanning
# the whole of base_df with a boolean mask for every pair. Rows with a missing
# genus or species fall in no group, matching the old equality test.
rows_by_pair = {
    pair: frame
    for pair, frame in base_df.groupby(['genus', 'species'], sort=False)
}

# Records are collected in a list and turned into a DataFrame once.
# DataFrame.append was removed in pandas 2.0 and copied the frame on each call.
records = []
for scn in scn_list :
    tmp_df = rows_by_pair.get((scn[0], scn[1]))
    if tmp_df is None:
        # No row carries exactly this pair; nothing to emit.
        continue

    if len(tmp_df) == 1 :
        # Single occurrence: copy the row's values as they are.
        tmp_dict = {
            'family'  : tmp_df['family'].iloc[0],
            'genus' : scn[0] ,
            'species' : scn[1],
            'sub_sp' : tmp_df['sub_sp'].iloc[0],
            'common_name' : tmp_df['common_name'].iloc[0],
            'range_state': tmp_df['range_state'].iloc[0],
            'is_tropical' : tmp_df['is_tropical'].iloc[0],
            'is_coniferous': tmp_df['is_coniferous'].iloc[0],
            'flag' : tmp_df['flag'].iloc[0]
        }
    else:
        # Several rows for one species: merge them field by field.
        # - is_tropical gets its own maximum. The values used to be appended
        #   to the already-reduced coniferous result, so the tropical value
        #   was always None or the cell raised.
        # - flag takes the maximum across the duplicates, which was computed
        #   before but never used (the first row's flag was written).
        #   NOTE(review): confirm that the maximum is the intended merge rule.
        tmp_dict = {
            'family'  : _first_string(tmp_df['family']),
            'genus' : scn[0] ,
            'species' : scn[1],
            'sub_sp' : _merge_text_values(tmp_df['sub_sp']),
            'common_name' : _merge_text_values(tmp_df['common_name']),
            'range_state': _merge_text_values(tmp_df['range_state']),
            'is_tropical' : _max_number(tmp_df['is_tropical']),
            'is_coniferous': _max_number(tmp_df['is_coniferous']),
            'flag' : _max_number(tmp_df['flag'])
        }
    records.append(tmp_dict)

new_df = pd.DataFrame(records, columns=new_cols)

new_df.head(10)

Unnamed: 0,family,genus,species,sub_sp,common_name,range_state,is_tropical,is_coniferous,flag
0,,Abarema,abbottii,,,,,,1.0
1,,Abarema,alexandri,,,,,,0.0
2,,Abarema,bigemina,,,,,,1.0
3,,Abarema,callejasii,,,,,,1.0
4,,Abarema,centiflora,,,,,,1.0
5,,Abarema,cochleata,,,,,,1.0
6,,Abarema,cochliocarpos,,,,,,0.0
7,,Abarema,commutata,,,,,,0.0
8,,Abarema,curvicarpa,,orelha de negro,,,,0.0
9,,Abarema,filamentosa,,,,,,1.0


In [9]:
# Write the deduplicated table to disk.
new_df.to_csv('combined_species_3.csv')