# Jupyter Pandas Cheat Sheet

In [0]:
import getpass
import os

import numpy as np
import pandas as pd


In [1]:
import pandas as pd

# Load the IMDB movie data set; this is the `df` the cells below work on.
df = pd.read_csv(
    'Data/IMDB_Movie_Data.csv',
    sep=",",
)

## Exploring Data

In [None]:
df.head()       # first five rows

In [0]:
df.tail()       # last five rows

In [0]:
df.sample(5)    # random sample of rows

In [0]:
df.shape        # number of rows/columns 

In [0]:
df.describe()   # calculates measures of central tendency

In [None]:
df.info()       # memory footprint and datatypes

## Statistics

In [0]:
df.describe() # Summary statistics for numerical columns

In [0]:
df.mean() # Returns the mean of all columns

In [0]:
df.corr() # Returns the correlation between columns in a DataFrame

In [0]:
df.count() # Returns the number of non-null values in each DataFrame column

In [0]:
df.max()  # Returns the highest value in each column

In [0]:
df.min() # Returns the lowest value in each column

In [0]:
df.median() # Returns the median of each column

In [0]:
df.std() # Returns the standard deviation of each column

## Import Packages

In [0]:
from google.colab import data_table

# Render the DataFrame as an interactive table (Google Colab only).
# The original referenced `df_conf`, which is not defined anywhere in this
# notebook; `df` is the frame loaded at the top.
data_table.DataTable(df)

## Import Data

### Mount Google Drive

In [0]:
from google.colab import drive

# Mount Google Drive at /content/drive without forcing a re-mount.
drive.mount(
    '/content/drive',
    force_remount=False,
)

### Import csv

In [0]:
# Read a comma-separated file into a DataFrame and display it.
df = pd.read_csv(
    'Data/my-data.csv',
    sep=",",
)
df

### Import xls

In [0]:
# Read the first sheet of an Excel workbook with default options.
df = pd.read_excel(
    'Data/my-data.xlsx',
)

In [0]:
# Read one named sheet of an Excel workbook and drop one line while reading.
df = pd.read_excel(
    'Data/my-data.xlsx',
    sheet_name='sheet1',  # the old keyword `sheetname` was removed from pandas
    skiprows=[1],         # skip the line with 0-based index 1, i.e. the line directly below the header row
)

### Import List Files

In [0]:
# Read every CSV file found in the Data directory into its own DataFrame.
# Sorting makes the read order (and therefore `dfinput`) deterministic.
files = sorted(f for f in os.listdir('Data') if f.lower().endswith('.csv'))

dffiles = []
dfinput = None  # stays None when the directory holds no CSV file
for f in files:
    # Build the path from the current file name; the original cell read the
    # same hard-coded file on every iteration.
    dfinput = pd.read_csv(os.path.join('Data', f), sep=",")
    dffiles.append(dfinput)

# The frame read last; all frames are available in `dffiles`.
dfinput

### Table Creator

### Import MySQL

In [0]:
import pymysql

# Never store the password in the notebook: take it from the MYSQL_PASSWORD
# environment variable or prompt for it interactively.
# NOTE(review): the password that was hard-coded here before should be rotated.
mysql_password = os.environ.get('MYSQL_PASSWORD') or getpass.getpass('MySQL password: ')

conn = pymysql.connect(host='10.144.16.147', port=3306, db='b2b', user='sme', password=mysql_password)
try:
    df = pd.read_sql_query(
        "SELECT * FROM b2b_rollen LIMIT 3;",
        conn)
finally:
    conn.close()  # release the connection even when the query fails

df.tail(100)

### Import Teradata

In [0]:
import teradata

# UdaExec is the entry point of the teradata module and has to be created
# before connect() can be called; the original cell used `udaExec` without
# ever defining it.
udaExec = teradata.UdaExec(appName="PandasCheatSheet", version="1.0", logConsole=False)

# Never store the password in the notebook: take it from the TERADATA_PASSWORD
# environment variable or prompt for it interactively.
# NOTE(review): the password that was hard-coded here before should be rotated.
teradata_password = os.environ.get('TERADATA_PASSWORD') or getpass.getpass('Teradata password: ')

# A dollar sign in the password has to be written as two dollar signs in the
# string handed to connect(), so the real password is escaped here.
teradata_password = teradata_password.replace('$', '$$')

query = "SELECT * FROM DATABASEX.TABLENAMEX"

# Open the connection and read the query result into a DataFrame; the
# with-block closes the session afterwards.
with udaExec.connect(method="odbc",
                     USEREGIONALSETTINGS="N",
                     system="dwhpprd",
                     username="dwh_pprd_ex_SPS",
                     password=teradata_password) as session:
    df = pd.read_sql(query, session)

# Do something with df, e.g. look at the first five rows.
df.head()

### Import SAP-Hana

In [0]:
import pyhdb

# Never store the password in the notebook: take it from the HANA_PASSWORD
# environment variable or prompt for it interactively. This also removes the
# invalid "\[" escape sequence the hard-coded literal contained.
# NOTE(review): the password that was hard-coded here before should be rotated.
hana_password = os.environ.get('HANA_PASSWORD') or getpass.getpass('SAP HANA password: ')

connection = pyhdb.connect(
    host="sg006134.corproot.net",
    port=30015,
    user="HDB_REPORT_SHOP_CO_OUT",
    password=hana_password
)
try:
    print(connection.isconnected())
    # The object name contains dots and colons, so it must stay double-quoted in SQL.
    query = 'SELECT * FROM HDB_REPORT_SHOP_CO_OUT."01_SWISSCOM.SC_REPORT.SHOP_CO_OUT::pex_sd_faktura"'
    df = pd.read_sql(query, connection)
finally:
    connection.close()  # release the connection even when the query fails

# Do something with df, e.g. look at the first five rows.
df.head()

## Export Data

### Export xls

In [0]:
# Write the DataFrame (including its index) to an Excel workbook.
df.to_excel(
    'Data/my-data_V2.xlsx',
)

### Export csv

In [0]:
# Write the DataFrame to CSV with a header line and without the index column.
df.to_csv(
    'Data/my-data_V2.csv',
    index=False,
    header=True,
)

## Column manipulation

### Column Filter

In [0]:
df[['Title','Rating']]

In [0]:
df.filter(['Title','Rating'])

### Column Rename

In [0]:
# Rename columns. The call returns a renamed copy and leaves `df` untouched:
# the later cells of this notebook still need the 'Title' and 'Rating'
# columns, so the rename must not be applied in place. (The original line also
# had a stray closing parenthesis, which was a SyntaxError.)
df.rename(columns={'Title': 'a', 'Rating': 'c'})

### Column Resorter/Reorder

In [0]:
# show column values
df.columns.values

In [0]:
# reoorder Rating after Title
df[['Title', 'Rating','Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)',  'Votes', 'Revenue (Millions)',
       'Metascore']]

### Constant Value Column

In [0]:
# Add a column that holds the same constant in every row.
constant_value = 23
df['new_column'] = constant_value
df.head(n=5)

### Math Formula

In [0]:
# Element-wise sum of two numeric columns, stored as a new column.
df['Rating_Votes'] = df['Rating'] + df['Votes']
df[['Rating_Votes', 'Rating', 'Votes']].head(n=5)

### Number to String

In [0]:
# Cast the Year column to text and store it next to the original.
year_as_text = df['Year'].astype(str)
df['Year_str'] = year_as_text
df.info()

### String to Number

In [0]:
# Cast the text column created above back to integers.
year_as_int = df['Year_str'].astype(int)
df['Year_int'] = year_as_int
df.info()

### Double to Int

In [0]:
# Round the floating-point rating to zero decimals, then cast to integer.
rounded_rating = df['Rating'].round(decimals=0)
df['Rating_int'] = rounded_rating.astype(int)
df[['Rating_int', 'Rating']].head(n=5)

### String Replacer

In [0]:
# Replace cells whose whole value equals 'Prometheus' with 'Alien'.
# The result is a new Series; `df` itself is not modified.
df['Title'].replace(to_replace='Prometheus', value='Alien')


### String Manipulate

In [0]:
# lower
df['Title2'] = df['Title'].str.lower()
df[['Title2','Title']].head()

In [0]:
# upper
df['Title2'] = df['Title'].str.upper() 
df[['Title2','Title']].head()

In [0]:
# length
df['Title2'] = df['Title'].str.len() 
df[['Title2','Title']].head()

In [0]:
# first Word
df['Title2'] = df['Title'].str.split(' ').str[0]
df[['Title2','Title']].head()

In [0]:
df['Title2'] = df['Title'].str.find('Squad', 0) 
df[['Title2','Title']].head()

### Date manipulation

In [0]:
# Parse a date string into a pandas Timestamp.
date_text = '2010/11/12'
pd.to_datetime(date_text)

### Sort

In [0]:
df.sort_values(by='Title', ascending=True)

In [0]:
df.sort_values(by=['Director','Year'], ascending=True)

## Row manipulation

### Row Filter

In [0]:
df[df.Title == 'Prometheus']

In [0]:
df[df.Rating >= 8.5]

In [0]:
df[(df.Year == 2016) & (df.Rating >= 8.5)]

In [0]:
titel = ['Prometheus','Sing', 'Guardians of the Galaxy']
df[df.Title.isin(titel)]

In [0]:
years = [2010,2015,2002]
df[df.Year.isin(years)]

In [0]:
# Selects rows 1-to-3
df.iloc[0:3]

In [0]:
# First 4 rows and first 2 columns
df.iloc[0:4, 0:2]

## Table Manipulation

### Group By

In [0]:
df.groupby("Year")["Title"].count().to_frame() # Anzahl Titel pro Jahr

In [0]:
df.groupby(["Year","Director"])["Title"].count().to_frame().reset_index() # Anzahl Titel pro Jahr und pro Director

In [0]:
df.groupby(["Director"])["Title"].count().to_frame(name = 'count').reset_index() # Anzahl Titel pro Director -> in DataFrame

In [0]:
df.groupby(["Year","Director"])["Revenue (Millions)"].sum().to_frame().reset_index() # Total Revenue pro Jahr und pro Director

In [0]:
df.groupby("Director")["Rating"].mean().to_frame().reset_index() # Rating-Mean pro Director

In [0]:
# Mean rating and mean runtime per director. Several columns have to be
# selected with a list (double brackets); the tuple form
# ["Rating", "Runtime (Minutes)"] raises an error in pandas 2.0 and later.
df.groupby("Director")[["Rating", "Runtime (Minutes)"]].mean()

In [0]:
# One aggregation per column, computed for every (Year, Director) group.
aggregations = {
    'Title': "count",              # number of titles
    'Rating': "mean",              # mean rating
    'Revenue (Millions)': "sum",   # total revenue
}
df.groupby(["Year", "Director"]).agg(aggregations).reset_index()

### Pivot / Unpivot

In [0]:
# Pivot: mean of every numeric column per director.
# The default aggregation is the mean, which cannot be computed for text
# columns (pandas 2.0 and later raise instead of dropping them silently), so
# the values are restricted to the numeric columns explicitly.
numeric_columns = df.select_dtypes(include='number').columns.tolist()
pd.pivot_table(df, index=["Director"], values=numeric_columns).reset_index()

In [0]:
# Pivot: total revenue per director and year.
# The aggregation is given by name; passing the callable np.sum triggers a
# FutureWarning in recent pandas versions.
df4 = pd.pivot_table(
    df,
    index=["Director", "Year"],
    values=["Revenue (Millions)"],
    aggfunc="sum",
).reset_index()
df4

In [0]:
# Unpivot
df4 = pd.pivot_table(df,values=['Rating'], columns=['Year']).reset_index()
df4

In [0]:
df4.melt(id_vars=['index'],var_name='Year',value_name='Title')

### Join

In [0]:
# Count the titles per director, then join that count onto every row of df.
df3 = (
    df.groupby(["Director"])["Title"]
    .count()
    .to_frame(name='count')
    .reset_index()
)

# how: 'left', 'right', 'inner' or 'outer'
df.merge(
    df3[['Director', 'count']],
    how='left',
    left_on=['Director'],
    right_on=['Director'],
)


### Concat

In [0]:
# Append the rows of df2 to df; both frames need the same columns.
# DataFrame.append was removed in pandas 2.0, pd.concat is its replacement.
df2 = df.head(3)  # small example frame, so the cell also runs on a fresh kernel
pd.concat([df, df2])

In [0]:
# Stack the rows of two DataFrames on top of each other.
pd.concat([df, df2], axis='index')