# Pandas Tutorial

## Import

In [1]:
import pandas as pd

## Pandas Data Structures

In [2]:
# A pandas Series is a one-dimensional array whose values carry index labels.
data_1d = pd.Series(
    data=['Turkey', 'Germany', 'Netherlands', 'Belgium', 'Portugal'],
)
data_1d

0         Turkey
1        Germany
2    Netherlands
3        Belgium
4       Portugal
dtype: object

In [4]:
# Column-oriented input: every dict key is a column name and every list holds that column's values.
data_2d = {
    'Country': ['Turkey', 'Germany', 'Netherlands', 'Belgium', 'Portugal'],
    'Population': [83614362, 83190556, 17134923, 14800000, 10276617],
}
# Convert the dict to a DataFrame; `columns` fixes the column order explicitly.
df = pd.DataFrame(data_2d, columns=['Country', 'Population'])
df

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556
2,Netherlands,17134923
3,Belgium,14800000
4,Portugal,10276617


## Basic Operations

In [5]:
# Summary of the frame: index range, column names, non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Country     5 non-null      object
 1   Population  5 non-null      int64 
dtypes: int64(1), object(1)
memory usage: 208.0+ bytes


In [6]:
df.shape # (rows, columns) tuple: (5, 2) means df has 5 rows and 2 columns

(5, 2)

In [7]:
df.head(3) # Shows the first 3 rows of df.

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556
2,Netherlands,17134923


In [8]:
df.tail() # Shows the last rows of df (5 by default, so every row of this small frame appears).

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556
2,Netherlands,17134923
3,Belgium,14800000
4,Portugal,10276617


In [9]:
df.dtypes # Data type of each column

Country       object
Population     int64
dtype: object

In [10]:
df.iloc[2:5]   # Rows at positions 2, 3 and 4 (positions start at 0; the slice end is excluded).

Unnamed: 0,Country,Population
2,Netherlands,17134923
3,Belgium,14800000
4,Portugal,10276617


In [11]:
df.columns # Column (feature) names of df

Index(['Country', 'Population'], dtype='object')

In [12]:
df['Country'] # Select the Country column for all rows (returned as a Series).
# Attribute access, df.Country, returns the same column.

0         Turkey
1        Germany
2    Netherlands
3        Belgium
4       Portugal
Name: Country, dtype: object

In [13]:
# Passing a list of column names returns a DataFrame with just those columns
# (handy for narrowing down wide datasets).
df[['Country', 'Population']]

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556
2,Netherlands,17134923
3,Belgium,14800000
4,Portugal,10276617


In [15]:
df['Population'].max() # Largest value in the Population column

83614362

In [16]:
df['Population'].min() # Smallest value in the Population column

10276617

In [18]:
df.describe()       # Summary statistics for the numeric column (count, mean, std, min, quartiles, max)

Unnamed: 0,Population
count,5.0
mean,41803290.0
std,38054920.0
min,10276620.0
25%,14800000.0
50%,17134920.0
75%,83190560.0
max,83614360.0


In [19]:
# Boolean-mask filtering, comparable to a SQL WHERE clause:
# keep only the rows whose population exceeds 20 million.
df.loc[df['Population'] > 20_000_000]

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556


In [20]:
# Row(s) whose population equals the column maximum, i.e. the most populous country.
df.loc[df['Population'] == df['Population'].max()]

Unnamed: 0,Country,Population
0,Turkey,83614362


In [21]:
df.loc[0] # Selects the row whose index label is 0

Country         Turkey
Population    83614362
Name: 0, dtype: object

In [22]:
df.set_index('Country',inplace=True)
df
# Country becomes the index, so rows can now be looked up by country name with .loc.
# inplace=True modifies df itself instead of returning a new DataFrame.

Unnamed: 0_level_0,Population
Country,Unnamed: 1_level_1
Turkey,83614362
Germany,83190556
Netherlands,17134923
Belgium,14800000
Portugal,10276617


In [23]:
df.loc['Germany'] # Selects the row whose index label is 'Germany'

Population    83190556
Name: Germany, dtype: int64

In [24]:
df.reset_index(inplace=True) # Move Country back to a regular column and restore the default integer index
df

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556
2,Netherlands,17134923
3,Belgium,14800000
4,Portugal,10276617


## Read and Write CSV file

In [25]:
# Round trip through CSV: write df without its index column, then load the file back.
df.to_csv("countries.csv",index=False,columns=['Country','Population']) # Write the DataFrame to a CSV file.
df=pd.read_csv("countries.csv") # Read the CSV file back into df
print(type(df))
# Other ways to obtain a DataFrame (kept as commented-out examples):
#df=pd.read_excel("filename.xlsx","sheet1")  # Read one sheet of an Excel workbook
#df=pd.DataFrame(pythonDict)                # Build a DataFrame from a Python dictionary
#df=pd.DataFrame(tupleslist, columns=["Country","Population"]) # Build a DataFrame from a list of tuples
#df=pd.DataFrame(pythonDictList)            # Build a DataFrame from a list of Python dicts

<class 'pandas.core.frame.DataFrame'>


In [26]:
df # Display the frame that was read from countries.csv

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556
2,Netherlands,17134923
3,Belgium,14800000
4,Portugal,10276617


In [27]:
# countries.csv starts with a header row ("Country,Population"), because to_csv wrote one.
# header=0 tells pandas to consume that row and replace it with the names given here.
# (header=None would instead keep the old header line as the first data row.)
df=pd.read_csv("countries.csv", header=0, names=["Ulke","Nüfus"])

In [28]:
df2=pd.read_csv("countries.csv", nrows=3) # Read only the first 3 data rows (the header line is not counted)
df2

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556
2,Netherlands,17134923


## Missing Values

Real-world datasets often contain missing data. Deciding how to handle it is an important problem.

In [30]:
# Load the sample file using pandas' default missing-value detection.
df_missing=pd.read_csv("countries_missing.csv")
df_missing

Unnamed: 0,Country,Population
0,Turkey,83614362
1,Germany,83190556
2,Netherlands,17134923
3,Belgium,14200000
4,Portugal,10276617
5,Japan,
6,Egypt,Not Available
7,Italy,59257566
8,Spain,47351567
9,Argentina,-1


In [31]:
df_missing.isnull().sum() # Number of NaN values in each column

Country       0
Population    0
dtype: int64

In [34]:
# Treat each of these five markers as NaN while parsing, then count the NaNs per column.
df = pd.read_csv(
    "countries_missing.csv",
    na_values=["Not Available", "N.A", "NULL", "null", " "],
)
df.isnull().sum()

Country       0
Population    2
dtype: int64

In [35]:
# Per-column NaN markers: a population can never be -1, so in the Population
# column -1 is treated as missing as well.
df = pd.read_csv(
    "countries_missing.csv",
    na_values={
        'Population': ["Not Available", "N.A", " ", -1],
        'Country': ["Not Available", "N.A"],
    },
)
df.isnull().sum()

Country       0
Population    3
dtype: int64

In [36]:
new_df=df.fillna(0)            # Replace every NaN with 0; the result is a new frame stored in new_df
new_df

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
5,Japan,0.0
6,Egypt,0.0
7,Italy,59257566.0
8,Spain,47351567.0
9,Argentina,0.0


In [37]:
df.Population.mean() # Mean of the Population column (NaN entries are skipped)

45003655.85714286

In [39]:
# Column-specific replacements: each key names a column and each value is what
# replaces NaN in that column (missing populations get the column mean).
new_df = df.fillna(value={
    'Country': 'Empty',
    'Population': df['Population'].mean(),
})
new_df

Unnamed: 0,Country,Population
0,Turkey,83614360.0
1,Germany,83190560.0
2,Netherlands,17134920.0
3,Belgium,14200000.0
4,Portugal,10276620.0
5,Japan,45003660.0
6,Egypt,45003660.0
7,Italy,59257570.0
8,Spain,47351570.0
9,Argentina,45003660.0


In [40]:
df # Display df again: the fills above were stored in new_df, so df still has its NaN values

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
5,Japan,
6,Egypt,
7,Italy,59257566.0
8,Spain,47351567.0
9,Argentina,


In [41]:
# Forward fill: each NaN takes the last valid value above it in the same column.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument is deprecated.
new_df=df.ffill()
new_df

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
5,Japan,10276617.0
6,Egypt,10276617.0
7,Italy,59257566.0
8,Spain,47351567.0
9,Argentina,47351567.0


In [42]:
# Backward fill: each NaN takes the next valid value below it in the same column.
# A NaN in the last row stays NaN because nothing follows it.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` argument is deprecated.
new_df=df.bfill()
new_df

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
5,Japan,59257566.0
6,Egypt,59257566.0
7,Italy,59257566.0
8,Spain,47351567.0
9,Argentina,


In [43]:
# Backward fill across columns: each NaN takes the value from the column to its right.
# Population is the right-most column, so nothing is filled here.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` argument is deprecated.
new_df=df.bfill(axis="columns")
new_df

Unnamed: 0,Country,Population
0,Turkey,83614400.0
1,Germany,83190600.0
2,Netherlands,17134900.0
3,Belgium,14200000.0
4,Portugal,10276600.0
5,Japan,
6,Egypt,
7,Italy,59257600.0
8,Spain,47351600.0
9,Argentina,


In [44]:
# Forward fill across columns: each NaN takes the value from the column to its left,
# so a missing Population is replaced by that row's Country name.
# DataFrame.ffill() replaces fillna(method="ffill"), whose `method` argument is deprecated.
new_df=df.ffill(axis="columns")
new_df

Unnamed: 0,Country,Population
0,Turkey,8.36144e+07
1,Germany,8.31906e+07
2,Netherlands,1.71349e+07
3,Belgium,1.42e+07
4,Portugal,1.02766e+07
5,Japan,Japan
6,Egypt,Egypt
7,Italy,5.92576e+07
8,Spain,4.73516e+07
9,Argentina,Argentina


In [46]:
# Backward fill with limit=1: at most one consecutive NaN is filled from the value below it;
# any further NaNs in the same gap stay NaN.
# DataFrame.bfill() replaces fillna(method="bfill"), whose `method` argument is deprecated.
new_df=df.bfill(limit=1)
new_df

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
5,Japan,
6,Egypt,59257566.0
7,Italy,59257566.0
8,Spain,47351567.0
9,Argentina,


In [47]:
df # Display df again; its NaN values are still in place

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
5,Japan,
6,Egypt,
7,Italy,59257566.0
8,Spain,47351567.0
9,Argentina,


In [48]:
# Interpolation estimates each NaN from the neighbouring values in the same column.
# Only the numeric Population column is interpolated: calling interpolate() on the whole
# frame would include the object-dtype Country column, which pandas has deprecated.
new_df=df.copy()
new_df['Population']=new_df['Population'].interpolate()
new_df

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
5,Japan,26603600.0
6,Egypt,42930583.0
7,Italy,59257566.0
8,Spain,47351567.0
9,Argentina,47351567.0


In [49]:
new_df=df.dropna()  # Drop every row that contains at least one NaN value
new_df

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
7,Italy,59257566.0
8,Spain,47351567.0


In [50]:
new_df=df.dropna(how="all")
# how="all" drops a row only when every column in it is NaN; a single valid value keeps the row.
# No row here has both Country and Population missing, so nothing is removed.
new_df

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
5,Japan,
6,Egypt,
7,Italy,59257566.0
8,Spain,47351567.0
9,Argentina,


In [51]:
new_df=df.dropna(thresh=2)       # Keep only the rows that have at least 2 non-NaN values
new_df

Unnamed: 0,Country,Population
0,Turkey,83614362.0
1,Germany,83190556.0
2,Netherlands,17134923.0
3,Belgium,14200000.0
4,Portugal,10276617.0
7,Italy,59257566.0
8,Spain,47351567.0


Some datasets mark missing values with a sentinel such as -99999 instead of NaN. How can we deal with that?

In [52]:
import numpy as np
# Load the grades file; some of its missing entries are marked with the sentinel -99999.
df_notes=pd.read_csv("notes.csv")
df_notes

Unnamed: 0,Student,Score
0,Adam,very good
1,Harold,excellent
2,Barry,average
3,Tom,good
4,Paul,good
5,Samet,average
6,Mary,poor
7,Linda,-99999
8,-99999,average
9,Elizabeth,


In [54]:
# Column dtypes and non-null counts for the grades frame.
df_notes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Student  10 non-null     object
 1   Score    9 non-null      object
dtypes: object(2)
memory usage: 288.0+ bytes


In [55]:
# Replace the sentinel string '-99999' with NaN in every column.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
new_df=df_notes.replace('-99999', np.nan)
new_df

Unnamed: 0,Student,Score
0,Adam,very good
1,Harold,excellent
2,Barry,average
3,Tom,good
4,Paul,good
5,Samet,average
6,Mary,poor
7,Linda,
8,,average
9,Elizabeth,


In [56]:
new_df.isnull().sum() # Number of NaN values in each column

Student    1
Score      2
dtype: int64

In [57]:
# Replace several sentinel strings ('-99999', '-1' and a single space) with NaN at once.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
new_df=df_notes.replace(['-99999','-1',' '], np.nan)
new_df

Unnamed: 0,Student,Score
0,Adam,very good
1,Harold,excellent
2,Barry,average
3,Tom,good
4,Paul,good
5,Samet,average
6,Mary,poor
7,Linda,
8,,average
9,Elizabeth,


In [58]:
# Map each textual grade to a number; a dict keeps every label next to its value.
new_df = df_notes.replace({
    'poor': 1,
    'average': 2,
    'good': 3,
    'very good': 4,
    'excellent': 5,
})
new_df

Unnamed: 0,Student,Score
0,Adam,4.0
1,Harold,5.0
2,Barry,2.0
3,Tom,3.0
4,Paul,3.0
5,Samet,2.0
6,Mary,1.0
7,Linda,-99999.0
8,-99999,2.0
9,Elizabeth,


In [59]:
# Column-specific replacement: each key names a column and its value lists the
# entries in that column to be replaced with NaN.
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
new_df=new_df.replace({
        'Student': '-99999',
        'Score': ['-99999','-1',' ']
},np.nan)
new_df

Unnamed: 0,Student,Score
0,Adam,4.0
1,Harold,5.0
2,Barry,2.0
3,Tom,3.0
4,Paul,3.0
5,Samet,2.0
6,Mary,1.0
7,Linda,
8,,2.0
9,Elizabeth,


In [60]:
# Column-specific fill values: a missing student name becomes 'Empty' and a
# missing score becomes the mean of the scores that are present.
new_df = new_df.fillna(value={
    'Student': 'Empty',
    'Score': new_df['Score'].mean(),
})
new_df

Unnamed: 0,Student,Score
0,Adam,4.0
1,Harold,5.0
2,Barry,2.0
3,Tom,3.0
4,Paul,3.0
5,Samet,2.0
6,Mary,1.0
7,Linda,2.75
8,Empty,2.0
9,Elizabeth,2.75
