## Pandas

### Import Pandas

Pandas; verileri manipüle edebilmek, üzerlerinde çalışabilmek ve temel veri hazırlama işlemlerini yapabilmek için kullanılan bir Python kütüphanesidir. Pandas'ın fonksiyonlarını kullanabilmek için kütüphaneyi, genellikle programlarımızın en üst satırlarında, içeri aktarmamız gerekir.

In [2]:
import pandas as pd

### Series

In [27]:
# In the output, the left column is the index and the right column holds the values.
sayilar = pd.Series(data=[10, 20, 30, 40, 50])
sayilar

0    10
1    20
2    30
3    40
4    50
dtype: int64

In [28]:
# Explicit labels replace the default 0..n-1 index.
# NOTE(review): "Franse" is kept as-is so the saved outputs still match; it looks
# like a typo for "Fransa" (the spelling used in a later cell) - confirm.
ulkeler = pd.Series(
    data=["Turkiye", "Ispanya", "Almanya", "Franse"],
    index=["birinci", "ikinci", "ucuncu", "dorduncu"],
)
ulkeler

birinci     Turkiye
ikinci      Ispanya
ucuncu      Almanya
dorduncu     Franse
dtype: object

In [30]:
# Give the Series a name; it appears in the last line of the repr.
ulkeler.name = "Ulke isimleri"
ulkeler

birinci     Turkiye
ikinci      Ispanya
ucuncu      Almanya
dorduncu     Franse
Name: Ulke isimleri, dtype: object

In [31]:
# Access the values of the Series (returned as a NumPy array).
ulkeler.values

array(['Turkiye', 'Ispanya', 'Almanya', 'Franse'], dtype=object)

In [32]:
# Fetch a single element by its index label.
ulkeler["ucuncu"]

'Almanya'

In [33]:
# The index can also be replaced after creation by assigning to `.index`.
ulkeler.index = [0, 1, 2, 3]
ulkeler

0    Turkiye
1    Ispanya
2    Almanya
3     Franse
Name: Ulke isimleri, dtype: object

In [35]:
# A dict supplies the labels (keys) and the values in a single step.
# NOTE: Python reads 30.000 as the float 30.0 ('.' is the decimal point),
# not as thirty thousand - see the output below.
ulkeler2 = pd.Series(
    data={"Turkiye": 30.000, "Italya": 20.000, "Almanya": 25.000},
    name="Ulkeler ve değerleri",
)
ulkeler2

Turkiye    30.0
Italya     20.0
Almanya    25.0
Name: Ulkeler ve değerleri, dtype: float64

In [37]:
# keys() returns the index labels of the Series.
ulkeler2.keys()

Index(['Turkiye', 'Italya', 'Almanya'], dtype='object')

In [39]:
# The values alone, as a NumPy float array.
ulkeler2.values

array([30., 20., 25.])

### Indeksleme

Listeler ve sözlüklerdeki gibi çalışır. İstenilen elemanın indeks değeri (etiketi) yazılarak o eleman veya elemanlar çağrılır.

In [40]:
# Same kind of labelled Series, built from a value list plus a matching index list.
ulkeler = pd.Series(
    [30.000, 20.000, 25.000, 10.000, 5.000],
    index=["Turkiye", "Italya", "Almanya", "Fransa", "Kanada"],
    name="Ulkeler ve değerleri",
)
ulkeler

Turkiye    30.0
Italya     20.0
Almanya    25.0
Fransa     10.0
Kanada      5.0
Name: Ulkeler ve değerleri, dtype: float64

In [41]:
# Dict-style lookup by label.
ulkeler["Almanya"]

25.0

In [42]:
# iloc selects by integer position, regardless of the index labels.
# Positions are zero-based, just like list indices.
ulkeler.iloc[0]

30.0

In [43]:
# Negative positions count from the end: -1 is the last element.
ulkeler.iloc[-1]

5.0

In [44]:
# Several elements can be selected at once by passing a list of labels.
ulkeler[["Turkiye", "Almanya"]]

Turkiye    30.0
Almanya    25.0
Name: Ulkeler ve değerleri, dtype: float64

In [45]:
# iloc performs the same multi-element selection by integer position.
# Plain `ulkeler[[0, 2]]` would treat the integers as positions only as a deprecated
# fallback (the index holds string labels), so use .iloc explicitly.
ulkeler.iloc[[0, 2]]

Turkiye    30.0
Almanya    25.0
Name: Ulkeler ve değerleri, dtype: float64

In [46]:
# Slicing works with labels too; note that the end label ("Kanada") is included.
ulkeler["Almanya": "Kanada"]

Almanya    25.0
Fransa     10.0
Kanada      5.0
Name: Ulkeler ve değerleri, dtype: float64

### Koşullu Seçim

In [47]:
# Show the full Series again as the starting point for filtering.
ulkeler

Turkiye    30.0
Italya     20.0
Almanya    25.0
Fransa     10.0
Kanada      5.0
Name: Ulkeler ve değerleri, dtype: float64

In [48]:
# A boolean mask inside [] keeps only the entries greater than 15.
ulkeler[ulkeler > 15]

Turkiye    30.0
Italya     20.0
Almanya    25.0
Name: Ulkeler ve değerleri, dtype: float64

In [49]:
# Arithmetic mean of the values.
ulkeler.mean()

18.0

In [50]:
# Keep only the entries strictly above the Series mean (18.0 here).
ulkeler.loc[ulkeler > ulkeler.mean()]

Turkiye    30.0
Italya     20.0
Almanya    25.0
Name: Ulkeler ve değerleri, dtype: float64

### DataFrame

In [51]:
# Column-oriented input: each key becomes a column, each list holds its values.
data = {
    'Country': ['Belgium', 'India', 'Brazil'],
    'Capital': ['Brussels', 'New Delhi', 'Brasilia'],
    'Population': [11190846, 1303171035, 207847528],
}

# `columns` pins the column order explicitly (here identical to the dict order).
df = pd.DataFrame(data=data, columns=list(data))

df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasilia,207847528


In [53]:
# Column labels of the frame.
df.columns

Index(['Country', 'Capital', 'Population'], dtype='object')

In [54]:
# Row labels; a default RangeIndex (0..2) because no index was supplied.
df.index

RangeIndex(start=0, stop=3, step=1)

In [55]:
# Display the frame again for reference.
df

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasilia,207847528


In [56]:
# Summary statistics; only the numeric column (Population) is reported.
df.describe()

Unnamed: 0,Population
count,3.0
mean,507403100.0
std,696134600.0
min,11190850.0
25%,109519200.0
50%,207847500.0
75%,755509300.0
max,1303171000.0


In [57]:
# Transpose: rows become columns and columns become rows.
df.T

Unnamed: 0,0,1,2
Country,Belgium,India,Brazil
Capital,Brussels,New Delhi,Brasilia
Population,11190846,1303171035,207847528


In [58]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(3, 3)

In [59]:
# Number of axes: 2 for a DataFrame (rows and columns).
df.ndim

2

In [60]:
# Total number of cells (3 rows x 3 columns = 9).
df.size

9

In [61]:
# The data as a NumPy array; mixed string/int columns give dtype=object.
df.values

array([['Belgium', 'Brussels', 11190846],
       ['India', 'New Delhi', 1303171035],
       ['Brazil', 'Brasilia', 207847528]], dtype=object)

In [62]:
# head() shows the first rows (up to five by default); this frame has only three.
df.head()

Unnamed: 0,Country,Capital,Population
0,Belgium,Brussels,11190846
1,India,New Delhi,1303171035
2,Brazil,Brasilia,207847528


In [63]:
# tail(1) shows only the last row.
df.tail(1)

Unnamed: 0,Country,Capital,Population
2,Brazil,Brasilia,207847528


In [69]:
# Selecting a single column by name returns a Series.
df['Population']

0      11190846
1    1303171035
2     207847528
Name: Population, dtype: int64

In [70]:
# Last row by position, returned as a Series indexed by the column names.
df.iloc[-1]

Country          Brazil
Capital        Brasilia
Population    207847528
Name: 2, dtype: object

In [81]:
import numpy as np
import pandas as pd

# Fixed seed so the random table is identical on every Restart & Run All.
# NOTE: the outputs saved in this notebook came from an unseeded run, so the
# displayed numbers will change once this cell is re-executed.
SEED = 42
rng = np.random.default_rng(SEED)

# 10 rows x 3 columns of integers drawn from [1, 30) (upper bound exclusive,
# same range as the previous np.random.randint(1, 30, ...)).
m = rng.integers(1, 30, size=(10, 3))
df = pd.DataFrame(m, columns=["var1", "var2", "var3"])
df

Unnamed: 0,var1,var2,var3
0,18,27,21
1,11,10,6
2,5,27,12
3,22,1,24
4,27,12,4
5,1,19,19
6,10,19,8
7,2,5,25
8,17,12,29
9,10,18,20


In [82]:
# loc slices by label and includes the end label: rows 0, 1, 2 and 3.
df.loc[0:3]

Unnamed: 0,var1,var2,var3
0,18,27,21
1,11,10,6
2,5,27,12
3,22,1,24


In [83]:
# iloc slices by position and excludes the end position: rows 0, 1 and 2.
df.iloc[0:3]

Unnamed: 0,var1,var2,var3
0,18,27,21
1,11,10,6
2,5,27,12


In [84]:
# First three rows and first two columns, both selected by position.
df.iloc[:3,:2]

Unnamed: 0,var1,var2
0,18,27
1,11,10
2,5,27


In [85]:
# Label-based row slice (end label included) combined with a single column label.
df.loc[0:3,"var3"]

0    21
1     6
2    12
3    24
Name: var3, dtype: int64

In [86]:
# Pick the column first, then take its first three entries by position.
df["var3"].iloc[0:3]

0    21
1     6
2    12
Name: var3, dtype: int64

### Koşullu Seçim (DataFrame)

In [87]:
# Two columns by name, then the first two rows by position.
df[["var1", "var2"]].iloc[0:2]

Unnamed: 0,var1,var2
0,18,27
1,11,10


In [88]:
# An element-wise comparison yields a boolean Series (a mask), one entry per row.
df.var1 > 15

0     True
1    False
2    False
3     True
4     True
5    False
6    False
7    False
8     True
9    False
Name: var1, dtype: bool

In [89]:
# var2 values of the rows whose var1 exceeds 15 (row mask and column in one .loc call).
df.loc[df["var1"] > 15, "var2"]

0    27
3     1
4    12
8    12
Name: var2, dtype: int64

In [90]:
# Combine two masks with & (element-wise AND); each comparison needs its own parentheses.
df.loc[(df["var1"] > 10) & (df["var3"] < 8)]

Unnamed: 0,var1,var2,var3
1,11,10,6
4,27,12,4


In [91]:
# Display the full frame again for comparison with the filtered results.
df

Unnamed: 0,var1,var2,var3
0,18,27,21
1,11,10,6
2,5,27,12
3,22,1,24
4,27,12,4
5,1,19,19
6,10,19,8
7,2,5,25
8,17,12,29
9,10,18,20


In [92]:
# Rows with var1 above 10, restricted to the var1 and var2 columns.
df.loc[df["var1"] > 10, ["var1", "var2"]]

Unnamed: 0,var1,var2
0,18,27
1,11,10
3,22,1
4,27,12
8,17,12


In [93]:
# Country-level indicators: one list per column, one position per row.
df = pd.DataFrame(
    {
        'Population': [35.467, 63.951, 80.94, 60.665, 127.061, 64.511, 318.523],
        'GDP': [1785387, 2833687, 3874437, 2167744, 4602367, 2950039, 17348075],
        'Surface Area': [9984670, 640679, 357114, 301336, 377930, 242495, 9525067],
        'HDI': [0.913, 0.888, 0.916, 0.873, 0.891, 0.907, 0.915],
        'Continent': ['America', 'Europe', 'Europe', 'Europe', 'Asia', 'Europe', 'America'],
    },
    # Explicit column order (matches the dict order above).
    columns=['Population', 'GDP', 'Surface Area', 'HDI', 'Continent'],
)
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [94]:
# A single column selected by name comes back as a Series.
df['Population']

0     35.467
1     63.951
2     80.940
3     60.665
4    127.061
5     64.511
6    318.523
Name: Population, dtype: float64

In [95]:
# to_frame() converts the Series into a one-column DataFrame, which renders as a table.
df['Population'].to_frame() # Same data as above, shown as a table

Unnamed: 0,Population
0,35.467
1,63.951
2,80.94
3,60.665
4,127.061
5,64.511
6,318.523


In [96]:
df[['Population', 'GDP']] # A list of column names returns a DataFrame with just those columns (index kept)

Unnamed: 0,Population,GDP
0,35.467,1785387
1,63.951,2833687
2,80.94,3874437
3,60.665,2167744
4,127.061,4602367
5,64.511,2950039
6,318.523,17348075


In [97]:
# A plain slice inside [] selects rows by position; the end is excluded (rows 1 and 2).
df[1:3]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe


In [98]:
# Rows where Population < 60.0 OR GDP > 80.0 (| is the element-wise OR).
# NOTE(review): 60.000 / 80.000 are the floats 60.0 / 80.0. Every GDP value in this
# frame is far above 80.0, so the OR is true for all rows and nothing is filtered
# out - the output is the whole frame. The thresholds probably need revisiting.
df[(df["Population"] < 60.000) | (df["GDP"] > 80.000)]

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


### Düşürmek

In [99]:
# Display the frame before dropping a column.
df

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [100]:
# drop returns a new frame without the GDP column; df itself is not modified
# (GDP is still present when df is used in the following cells).
df.drop(columns="GDP")

Unnamed: 0,Population,Surface Area,HDI,Continent
0,35.467,9984670,0.913,America
1,63.951,640679,0.888,Europe
2,80.94,357114,0.916,Europe
3,60.665,301336,0.873,Europe
4,127.061,377930,0.891,Asia
5,64.511,242495,0.907,Europe
6,318.523,9525067,0.915,America


### Sıralamak

In [101]:
# Sort rows by index label; the index is already 0..6, so the order is unchanged.
df.sort_index()

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
1,63.951,2833687,640679,0.888,Europe
2,80.94,3874437,357114,0.916,Europe
3,60.665,2167744,301336,0.873,Europe
4,127.061,4602367,377930,0.891,Asia
5,64.511,2950039,242495,0.907,Europe
6,318.523,17348075,9525067,0.915,America


In [102]:
# Sort rows by GDP in ascending order; each row keeps its original index label.
df.sort_values(by='GDP')

Unnamed: 0,Population,GDP,Surface Area,HDI,Continent
0,35.467,1785387,9984670,0.913,America
3,60.665,2167744,301336,0.873,Europe
1,63.951,2833687,640679,0.888,Europe
5,64.511,2950039,242495,0.907,Europe
2,80.94,3874437,357114,0.916,Europe
4,127.061,4602367,377930,0.891,Asia
6,318.523,17348075,9525067,0.915,America


### Aggregation & Grouping

* count()
* first()
* last()
* mean()
* median()
* min()
* max()
* std()
* var()
* sum()

In [103]:
# seaborn provides example datasets through sns.load_dataset (used below for "planets").
import seaborn as sns

In [105]:
# Load the "planets" example dataset; the bare `df` shows a truncated preview
# of the whole frame (first and last rows).
df = sns.load_dataset("planets")
df

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.300000,7.10,77.40,2006
1,Radial Velocity,1,874.774000,2.21,56.95,2008
2,Radial Velocity,1,763.000000,2.60,19.84,2011
3,Radial Velocity,1,326.030000,19.40,110.62,2007
4,Radial Velocity,1,516.220000,10.50,119.47,2009
...,...,...,...,...,...,...
1030,Transit,1,3.941507,,172.00,2006
1031,Transit,1,2.615864,,148.00,2007
1032,Transit,1,3.191524,,174.00,2007
1033,Transit,1,4.125083,,293.00,2008


In [106]:
# First five rows only - a more compact preview than displaying the whole frame.
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [107]:
# (rows, columns): 1035 rows and 6 columns.
df.shape

(1035, 6)

In [108]:
# Column-wise mean of the numeric columns only. `numeric_only=True` is required
# because the frame also contains the string column "method"; without it,
# pandas >= 2.0 raises a TypeError instead of silently skipping that column.
df.mean(numeric_only=True)

number               1.785507
orbital_period    2002.917596
mass                 2.638161
distance           264.069282
year              2009.070531
dtype: float64

In [109]:
# The mass column as a Series; it contains missing values (NaN).
df["mass"]

0        7.10
1        2.21
2        2.60
3       19.40
4       10.50
        ...  
1030      NaN
1031      NaN
1032      NaN
1033      NaN
1034      NaN
Name: mass, Length: 1035, dtype: float64

In [110]:
# Mean of the mass column (matches the `mass` entry of the frame-wide mean above).
df['mass'].mean()

2.6381605847953233

In [111]:
# count() reports only non-missing entries: 513 of the 1035 rows have a mass.
df["mass"].count()

513

In [112]:
# Smallest mass value.
df["mass"].min()

0.0036

In [113]:
# Largest mass value.
df["mass"].max()

25.0

In [114]:
# Sum of the mass values.
df["mass"].sum()

1353.37638

In [115]:
# Standard deviation of the mass values.
df["mass"].std()

3.8186166509616046

In [116]:
# Variance of the mass values (the square of the standard deviation above).
df["mass"].var()

14.58183312700122

In [117]:
# All summary statistics at once; the `count` row shows the non-missing entries per column.
df.describe()

Unnamed: 0,number,orbital_period,mass,distance,year
count,1035.0,992.0,513.0,808.0,1035.0
mean,1.785507,2002.917596,2.638161,264.069282,2009.070531
std,1.240976,26014.728304,3.818617,733.116493,3.972567
min,1.0,0.090706,0.0036,1.35,1989.0
25%,1.0,5.44254,0.229,32.56,2007.0
50%,1.0,39.9795,1.26,55.25,2010.0
75%,2.0,526.005,3.04,178.5,2012.0
max,7.0,730000.0,25.0,8500.0,2014.0


### GroupBy

In [118]:
# Small demo frame built row by row: (group label, value) pairs.
df = pd.DataFrame(
    [('A', 10), ('B', 11), ('C', 52), ('A', 23), ('B', 43), ('C', 55)],
    columns=['gruplar', 'veri'],
)
df

Unnamed: 0,gruplar,veri
0,A,10
1,B,11
2,C,52
3,A,23
4,B,43
5,C,55


In [119]:
# groupby alone returns a DataFrameGroupBy object (see the repr below);
# an aggregation such as mean() or sum() is needed to get actual values.
df.groupby("gruplar")

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x7fe05aa240d0>

In [120]:
# Mean of `veri` within each group; the group labels become the index.
df.groupby("gruplar").mean()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,16.5
B,27.0
C,53.5


In [121]:
# Sum of `veri` within each group.
df.groupby("gruplar").sum()

Unnamed: 0_level_0,veri
gruplar,Unnamed: 1_level_1
A,33
B,54
C,107


In [122]:
# Switch back to the "planets" dataset (this replaces the small demo frame in `df`).
df = sns.load_dataset("planets")
df.head()

Unnamed: 0,method,number,orbital_period,mass,distance,year
0,Radial Velocity,1,269.3,7.1,77.4,2006
1,Radial Velocity,1,874.774,2.21,56.95,2008
2,Radial Velocity,1,763.0,2.6,19.84,2011
3,Radial Velocity,1,326.03,19.4,110.62,2007
4,Radial Velocity,1,516.22,10.5,119.47,2009


In [123]:
# Mean orbital_period for each `method`; selecting the column before
# aggregating restricts the result to that one column.
df.groupby("method")["orbital_period"].mean()

method
Astrometry                          631.180000
Eclipse Timing Variations          4751.644444
Imaging                          118247.737500
Microlensing                       3153.571429
Orbital Brightness Modulation         0.709307
Pulsar Timing                      7343.021201
Pulsation Timing Variations        1170.000000
Radial Velocity                     823.354680
Transit                              21.102073
Transit Timing Variations            79.783500
Name: orbital_period, dtype: float64

In [124]:
# Full summary statistics of orbital_period, one row per `method`.
df.groupby("method")["orbital_period"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Astrometry,2.0,631.18,544.217663,246.36,438.77,631.18,823.59,1016.0
Eclipse Timing Variations,9.0,4751.644444,2499.130945,1916.25,2900.0,4343.5,5767.0,10220.0
Imaging,12.0,118247.7375,213978.177277,4639.15,8343.9,27500.0,94250.0,730000.0
Microlensing,7.0,3153.571429,1113.166333,1825.0,2375.0,3300.0,3550.0,5100.0
Orbital Brightness Modulation,3.0,0.709307,0.725493,0.240104,0.291496,0.342887,0.943908,1.544929
Pulsar Timing,5.0,7343.021201,16313.265573,0.090706,25.262,66.5419,98.2114,36525.0
Pulsation Timing Variations,1.0,1170.0,,1170.0,1170.0,1170.0,1170.0,1170.0
Radial Velocity,553.0,823.35468,1454.92621,0.73654,38.021,360.2,982.0,17337.5
Transit,397.0,21.102073,46.185893,0.355,3.16063,5.714932,16.1457,331.60059
Transit Timing Variations,3.0,79.7835,71.599884,22.3395,39.67525,57.011,108.5055,160.0


### Aggregation

In [125]:
import numpy as np
import pandas as pd
import seaborn as sns

# Demo frame with one grouping column and two numeric columns.
df = pd.DataFrame(
    data={
        'gruplar': ['A', 'B', 'C', 'A', 'B', 'C'],
        'değişken1': [10, 23, 33, 22, 11, 99],
        'değişken2': [100, 253, 333, 262, 111, 969],
    },
    columns=['gruplar', 'değişken1', 'değişken2'],
)
df

Unnamed: 0,gruplar,değişken1,değişken2
0,A,10,100
1,B,23,253
2,C,33,333
3,A,22,262
4,B,11,111
5,C,99,969


In [126]:
# Aggregate: mean of each numeric column within every `gruplar` group.

df.groupby('gruplar').mean()

Unnamed: 0_level_0,değişken1,değişken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,16,181
B,17,182
C,66,651


In [127]:
# Several aggregations at once: the result has one (column, statistic) pair per
# output column. The statistics are given by name - passing the builtins min/max
# or np.median is deprecated in recent pandas - which also matches the
# string-based dict form used in the next cell.
df.groupby('gruplar').aggregate(["min", "median", "max"])

Unnamed: 0_level_0,değişken1,değişken1,değişken1,değişken2,değişken2,değişken2
Unnamed: 0_level_1,min,median,max,min,median,max
gruplar,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
A,10,16,22,100,181,262
B,11,17,23,111,182,253
C,33,66,99,333,651,969


In [128]:
# A dict maps each column to its own aggregation:
# minimum of değişken1 and maximum of değişken2 per group.
df.groupby('gruplar').agg({"değişken1": "min", "değişken2": "max"})

Unnamed: 0_level_0,değişken1,değişken2
gruplar,Unnamed: 1_level_1,Unnamed: 2_level_1
A,10,262
B,11,253
C,33,969
