# Pandas Data Type

<img src="pandas_dtype.jpg" style="width:350px;height:220px"/>

<img src="method_convert_dtype.jpg" style="width:350px;height:220px"/>

In [1]:
import pandas as pd

In [2]:
weather = pd.read_csv('weather.csv')
weather

Unnamed: 0,date,temperature_high,temperature_low,rained,snowed,overcast,comments
0,2021-01-01,4.0,1,1,False,cloudy,happy new year
1,2021-01-02,11.0,2,0,False,sunny,second day
2,2021-01-03,3.0,2,0,False,foggy,third day
3,2021-01-04,6.0,2,0,False,sunny,first business day
4,2021-01-05,4.0,unknown,0,False,cloudy,second business day


In [3]:
weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   date              5 non-null      object 
 1   temperature_high  5 non-null      float64
 2   temperature_low   5 non-null      object 
 3   rained            5 non-null      int64  
 4   snowed            5 non-null      bool   
 5   overcast          5 non-null      object 
 6   comments          5 non-null      object 
dtypes: bool(1), float64(1), int64(1), object(4)
memory usage: 373.0+ bytes


In [4]:
weather.dtypes

date                 object
temperature_high    float64
temperature_low      object
rained                int64
snowed                 bool
overcast             object
comments             object
dtype: object

In [5]:
weather['temperature_high']

0     4.0
1    11.0
2     3.0
3     6.0
4     4.0
Name: temperature_high, dtype: float64

In [6]:
weather['temperature_high'].astype('int8')

0     4
1    11
2     3
3     6
4     4
Name: temperature_high, dtype: int8

In [7]:
weather.dtypes

date                 object
temperature_high    float64
temperature_low      object
rained                int64
snowed                 bool
overcast             object
comments             object
dtype: object

In [8]:
weather['temperature_high'] = weather['temperature_high'].astype('int8')

In [9]:
weather.dtypes

date                object
temperature_high      int8
temperature_low     object
rained               int64
snowed                bool
overcast            object
comments            object
dtype: object

In [10]:
weather['rained'] = weather['rained'].astype('bool') 

In [11]:
weather['rained']

0     True
1    False
2    False
3    False
4    False
Name: rained, dtype: bool

## Dictionaries method

In [12]:
new_weather = weather.astype({'overcast':'category','comments':'string'})
new_weather

Unnamed: 0,date,temperature_high,temperature_low,rained,snowed,overcast,comments
0,2021-01-01,4,1,True,False,cloudy,happy new year
1,2021-01-02,11,2,False,False,sunny,second day
2,2021-01-03,3,2,False,False,foggy,third day
3,2021-01-04,6,2,False,False,sunny,first business day
4,2021-01-05,4,unknown,False,False,cloudy,second business day


In [13]:
new_weather['overcast']

0    cloudy
1     sunny
2     foggy
3     sunny
4    cloudy
Name: overcast, dtype: category
Categories (3, object): ['cloudy', 'foggy', 'sunny']

In [14]:
new_weather['comments']

0         happy new year
1             second day
2              third day
3     first business day
4    second business day
Name: comments, dtype: string

In [15]:
new_weather.dtypes

date                  object
temperature_high        int8
temperature_low       object
rained                  bool
snowed                  bool
overcast            category
comments              string
dtype: object

In [16]:
# new_weather['temperature_low'] = new_weather['temperature_low'].astype('int8')

In [17]:
new_weather['temperature_low'] = pd.to_numeric(new_weather['temperature_low'],errors='coerce')

In [18]:
new_weather['temperature_low']

0    1.0
1    2.0
2    2.0
3    2.0
4    NaN
Name: temperature_low, dtype: float64

In [19]:
# new_weather['date'] = new_weather['date'].astype('daterime64')
new_weather['date'] = pd.to_datetime(new_weather['date'])

In [20]:
new_weather['date']

0   2021-01-01
1   2021-01-02
2   2021-01-03
3   2021-01-04
4   2021-01-05
Name: date, dtype: datetime64[ns]

In [21]:
new_weather

Unnamed: 0,date,temperature_high,temperature_low,rained,snowed,overcast,comments
0,2021-01-01,4,1.0,True,False,cloudy,happy new year
1,2021-01-02,11,2.0,False,False,sunny,second day
2,2021-01-03,3,2.0,False,False,foggy,third day
3,2021-01-04,6,2.0,False,False,sunny,first business day
4,2021-01-05,4,,False,False,cloudy,second business day


In [22]:
new_weather.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 7 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   date              5 non-null      datetime64[ns]
 1   temperature_high  5 non-null      int8          
 2   temperature_low   4 non-null      float64       
 3   rained            5 non-null      bool          
 4   snowed            5 non-null      bool          
 5   overcast          5 non-null      category      
 6   comments          5 non-null      string        
dtypes: bool(2), category(1), datetime64[ns](1), float64(1), int8(1), string(1)
memory usage: 400.0 bytes


# Load Data

In [23]:
hero_power = pd.read_csv('superhero_powers.csv')
marvel_hero = pd.read_excel('superhero_info.xlsx',sheet_name='Marvel Comics')
dc_hero = pd.read_excel('superhero_info.xlsx',sheet_name='DC Comics')

In [24]:
hero_power

Unnamed: 0,hero_names,Agility,Accelerated Healing,Lantern Power Ring,Dimensional Awareness,Cold Resistance,Durability,Stealth,Energy Absorption,Flight,...,Web Creation,Reality Warping,Odin Force,Symbiote Costume,Speed Force,Phoenix Force,Molecular Dissipation,Vision - Cryo,Omnipresent,Omniscient
0,3-D Man,True,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,A-Bomb,False,True,False,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,Abe Sapien,True,True,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3,Abin Sur,False,False,True,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,Abomination,False,True,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
662,Yellowjacket II,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
663,Ymir,False,False,False,False,True,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
664,Yoda,True,False,False,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
665,Zatanna,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [25]:
hero_power.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 168 columns):
 #    Column                        Dtype 
---   ------                        ----- 
 0    hero_names                    object
 1    Agility                       bool  
 2    Accelerated Healing           bool  
 3    Lantern Power Ring            bool  
 4    Dimensional Awareness         bool  
 5    Cold Resistance               bool  
 6    Durability                    bool  
 7    Stealth                       bool  
 8    Energy Absorption             bool  
 9    Flight                        bool  
 10   Danger Sense                  bool  
 11   Underwater breathing          bool  
 12   Marksmanship                  bool  
 13   Weapons Master                bool  
 14   Power Augmentation            bool  
 15   Animal Attributes             bool  
 16   Longevity                     bool  
 17   Intelligence                  bool  
 18   Super Strength              

In [26]:
hero_power['hero_names'] = hero_power['hero_names'].astype('string')

In [27]:
hero_power.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 667 entries, 0 to 666
Data columns (total 168 columns):
 #    Column                        Dtype 
---   ------                        ----- 
 0    hero_names                    string
 1    Agility                       bool  
 2    Accelerated Healing           bool  
 3    Lantern Power Ring            bool  
 4    Dimensional Awareness         bool  
 5    Cold Resistance               bool  
 6    Durability                    bool  
 7    Stealth                       bool  
 8    Energy Absorption             bool  
 9    Flight                        bool  
 10   Danger Sense                  bool  
 11   Underwater breathing          bool  
 12   Marksmanship                  bool  
 13   Weapons Master                bool  
 14   Power Augmentation            bool  
 15   Animal Attributes             bool  
 16   Longevity                     bool  
 17   Intelligence                  bool  
 18   Super Strength              

## Specify the Dtype when load data

In [28]:
hero_power = pd.read_csv('superhero_powers.csv',dtype={'hero_names':'string'})

In [29]:
hero_power['hero_names']

0              3-D Man
1               A-Bomb
2           Abe Sapien
3             Abin Sur
4          Abomination
            ...       
662    Yellowjacket II
663               Ymir
664               Yoda
665            Zatanna
666               Zoom
Name: hero_names, Length: 667, dtype: string

In [30]:
dc_hero.dtypes

name           object
Gender         object
Eye color      object
Race           object
Hair color     object
Height          int64
Publisher      object
Alignment      object
Weight        float64
dtype: object

In [31]:
hero_dtype = {'name':'string',
              'Gender': 'category',
              'Eye Color': 'string',
              'Race': 'string',
              'Hair Color': 'string',
              'Publisher': 'string',
              'Alignment': 'category'
              }

In [32]:
marvel_hero = pd.read_excel('superhero_info.xlsx',sheet_name='Marvel Comics',dtype=hero_dtype)
dc_hero = pd.read_excel('superhero_info.xlsx',sheet_name='DC Comics',dtype=hero_dtype)


In [33]:
dc_hero.dtypes

name            string
Gender        category
Eye color       object
Race            string
Hair color      object
Height           int64
Publisher       string
Alignment     category
Weight         float64
dtype: object

In [34]:
marvel_hero.dtypes

name            string
Gender        category
Eye color       object
Race            string
Hair color      object
Height           int64
Publisher       string
Alignment     category
Weight           int64
dtype: object

In [36]:
hero_power.to_pickle('Hero Power.pkl')