    # VeriKaynağı: https://archive.ics.uci.edu/ml/datasets/online+retail
    # Orijinal veri dosyası üzerinde küçük temizlikler yapılmıştır.
    # Bu yüzden Github repoda bulunan OnlineRetail.csv kaynakta bulunan ile küçük farklılıklar gösterebilir. Orijinal veri seti OnlineRetailDirty.xlsx dir.
    
    
    NİTELİKLER VE AÇIKLAMALARI
    # InvoiceNo: Invoice number. Nominal, a 6-digit integral number uniquely assigned to each transaction. If this code starts with letter 'c', it indicates a cancellation. 
    # StockCode: Product (item) code. Nominal, a 5-digit integral number uniquely assigned to each distinct product. 
    # Description: Product (item) name. Nominal. 
    # Quantity: The quantities of each product (item) per transaction. Numeric.	
    # InvoiceDate: Invice Date and time. Numeric, the day and time when each transaction was generated. 
    # UnitPrice: Unit price. Numeric, Product price per unit in sterling. 
    # CustomerID: Customer number. Nominal, a 5-digit integral number uniquely assigned to each customer. 
    # Country: Country name. Nominal, the name of the country where each customer resides.

In [1]:
import pandas as pd

In [2]:
data_path = "D:\\Datasets\\"
data_set = "OnlineRetail2.csv"

In [3]:
df = pd.read_csv(data_path + data_set, sep=",")

In [4]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


# Pandas Dataframe Örneklem (Sample)

In [5]:
df_sample = df['InvoiceNo'].sample(n=100, random_state=142)
df_sample.head()

128745    547359
284670    561882
399875    571289
352570    567673
194586    553657
Name: InvoiceNo, dtype: object

In [6]:
print(len(df))
print(len(df_sample))

541909
100


# Dataframe Bilfileri (info ve describe)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null object
UnitPrice      541909 non-null float64
CustomerID     406829 non-null float64
Country        541909 non-null object
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [8]:
df.describe()

Unnamed: 0,Quantity,UnitPrice,CustomerID
count,541909.0,541909.0,406829.0
mean,9.55225,4.611114,15287.69057
std,218.081158,96.759853,1713.600303
min,-80995.0,-11062.06,12346.0
25%,1.0,1.25,13953.0
50%,3.0,2.08,15152.0
75%,10.0,4.13,16791.0
max,80995.0,38970.0,18287.0


# astype() Pandas Dataframe tür dönüşümü

In [9]:
# CustomerID float64, onu string yapalım
df['CustomerID'] = df['CustomerID'].astype('str')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
InvoiceNo      541909 non-null object
StockCode      541909 non-null object
Description    540455 non-null object
Quantity       541909 non-null int64
InvoiceDate    541909 non-null object
UnitPrice      541909 non-null float64
CustomerID     541909 non-null object
Country        541909 non-null object
dtypes: float64(1), int64(1), object(6)
memory usage: 33.1+ MB


# Dataframe satır sayısı

In [11]:
len(df)

541909

In [12]:
# count ile farklı değerlerin çıkması null değer bulunduğunu gösteriyor.
# Description içinde null var.

# Dataframe içinde null kontrolü

In [12]:
df.isnull().values.any()

True

In [12]:
# Bunu vertabanı tablosunda bu kadar kolay yapmak mümkün mü? Niçin?

In [13]:
df.isnull().sum()

InvoiceNo         0
StockCode         0
Description    1454
Quantity          0
InvoiceDate       0
UnitPrice         0
CustomerID        0
Country           0
dtype: int64

# Series içinde null kontrolü

In [15]:
df['InvoiceNo'].isnull().values.any()

False

# Null değerleri düşürme

In [14]:
# Dikkat! Bunu yaptıktan sonra verinin eski haline dönmek için yeniden okutmalısınız.
df = df.dropna(inplace=False)

In [15]:
df.count()

InvoiceNo      540455
StockCode      540455
Description    540455
Quantity       540455
InvoiceDate    540455
UnitPrice      540455
CustomerID     540455
Country        540455
dtype: int64

In [18]:
# Bütün satır sayıları eşit oldu. En düşüksatır sayısına indi.

# Sütunları Adıyla Seçmek

In [16]:
df[["InvoiceNo", "StockCode"]].head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,85123A
1,536365,71053
2,536365,84406B
3,536365,84029G
4,536365,84029E


# Sütunları Indeks Numaraları ile Seçmek

In [17]:
df.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom


In [21]:
# iloc[] metodu ile satır ve sütunları indeks bazlı seçebiliriz

In [18]:
# Sadece belirli sütunları seçmek için liste içinde indeks numaralarını vermek gerekir
# [satırlar, sütunlar]
# Tüm satırları ve 1. ve 3. inkdekse sahip sütunları seç
df.iloc[:,[1,3]].head()

Unnamed: 0,StockCode,Quantity
0,85123A,6
1,71053,6
2,84406B,8
3,84029G,6
4,84029E,6


In [19]:
# Tüm satırları ve son sütuna kadar tüm sütunları seç
df.iloc[:,:-1].head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0


In [24]:
# loc metodu ile sütun isimlerini kullanarak seçim yapabiliriz.

In [20]:
# Tümsatırları ve 'InvoiceNo','StockCode' sütunlarını seç
df.loc[:,['InvoiceNo','StockCode']].head()

Unnamed: 0,InvoiceNo,StockCode
0,536365,85123A
1,536365,71053
2,536365,84406B
3,536365,84029G
4,536365,84029E


# Satırları Indeks Numaraları ile Seçmek

In [21]:
# 0'dan 20'ye kadar olan indeksli satırları ve tüm sütunları seç
df.iloc[0:20,:].head(30)

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850.0,United Kingdom
5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850.0,United Kingdom
6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850.0,United Kingdom
7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
8,536366,22632,HAND WARMER RED POLKA DOT,6,2010-12-01 08:28:00,1.85,17850.0,United Kingdom
9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047.0,United Kingdom


# Indeks numarasıyla seçip yeni bir dataframe oluşturma

In [22]:
# Tüm satırlar ve 0'dan 2. indekse kadar olan sütunları seç ve bir df'e ata
iloc_df = df.iloc[:, 0:2]

In [23]:
type(iloc_df)

pandas.core.frame.DataFrame

In [24]:
iloc_df.shape

(540455, 2)

# Pandas Dataframe'den numpy ndarray elde etmek

In [25]:
iloc_npa = df.iloc[:, 0:2].values

In [26]:
type(iloc_npa)

numpy.ndarray

In [32]:
iloc_npa.shape

(540455, 2)