# PANDAS
Visit the official __Pandas__ website for more content at __[this link](https://pandas.pydata.org)__
<img src="https://s3-ap-south-1.amazonaws.com/av-blog-media/wp-content/uploads/2018/03/pandas.jpg"
alt="pandas" title="Pandas" height="400" width="400" />

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np

# Importing Data

In [None]:
# Import data from a CSV file
pd.read_csv(filename)

# Import data from a delimited text file (tab-separated by default, like TSV)
pd.read_table(filename)

# Import data from an Excel file
pd.read_excel(filename)

# Read the result of a SQL query (or a whole table) through an open connection
pd.read_sql(query, connection_object)

# Import data from JSON; the argument can be a JSON string, a URL or a file
# NOTE(review): recent pandas versions deprecate passing a literal JSON string
# directly (wrap it in io.StringIO) -- confirm against the installed version
pd.read_json(json_string)

# Parse an HTML URL, string or file and extract its tables into a list of DataFrames
pd.read_html(url)

# Take the contents of your clipboard and pass them to read_csv()
# (read_table() in older pandas versions)
# If you do not know how to copy to your clipboard, I recommend this link:
# http://ask.xmodulo.com/copy-file-content-clipboard-linux-desktop.html
pd.read_clipboard()

# Build a DataFrame from a dict: keys become column names, values (lists) the data
pd.DataFrame(var_dict)

# Exporting Data

In [None]:
# Write the DataFrame to a CSV file
df.to_csv(filename)

# Write the DataFrame to an Excel file
df.to_excel(filename)

# Write the DataFrame to a SQL table through an open connection
df.to_sql(table_name, connection_object)

# Write the DataFrame to a file in JSON format
df.to_json(filename)

# Create Test Objects

In [None]:
# Useful for testing code segments

# DataFrame of random floats with 20 rows and 5 columns
pd.DataFrame(np.random.rand(20, 5))

# Create a Series from an iterable 'my_list'
pd.Series(my_list)

# Replace the index with consecutive dates starting at 1900-01-30,
# one date per row of df (df.shape[0] is the row count)
df.index = pd.date_range('1900/1/30', periods=df.shape[0])

# Viewing and inspecting data

In [None]:
# First n rows of the DataFrame
df.head(n)

# Last n rows of the DataFrame
df.tail(n)

# Number of rows and columns, as a (rows, columns) tuple
# (shape is an attribute, not a method: df.shape() raises TypeError)
df.shape

# Index, datatype and memory information
df.info()

# Summary statistics for numerical columns
df.describe()

# View unique values and their counts; dropna=False also counts NaN
# (the method is value_counts, plural)
s.value_counts(dropna=False)

# Unique values and counts for all columns
df.apply(pd.Series.value_counts)

# Selection

In [None]:
# Returns the column with label col as a Series
df[col]

# Returns the listed columns as a new DataFrame
df[[col1, col2]]

# Selection by position (first element of the Series)
s.iloc[0]

# Selection by index label
s.loc['index_one']

# First row
df.iloc[0, :]

# First element of the first column
df.iloc[0, 0]

# Data Cleaning

In [None]:
# Rename columns (the list needs one label per existing column)
df.columns = ['a', 'b', 'c']

# Check for null values; returns a boolean object shaped like its argument
# (pd.isnull requires the object to test -- calling it bare raises TypeError)
pd.isnull(df)

# Opposite of pd.isnull()
pd.notnull(df)

# Drop all columns that contain null values
df.dropna(axis=1)

# Drop all rows that have fewer than n non-null values
# (axis=0 selects rows; axis=1 would drop columns instead)
df.dropna(axis=0, thresh=n)

# Replace all null values with x
df.fillna(x)

# Replace all null values with the mean
# (mean can be replaced with almost any function from the statistics section)
s.fillna(s.mean())

# Convert the datatype of the Series to float
s.astype(float)

# Replace all values equal to 1 with 'one'
s.replace(1, 'one')

# Replace all 1 with 'one' and 3 with 'three'
s.replace([1, 3], ['one', 'three'])

# Mass renaming of columns: the function is applied to every column label
# (x + 1 assumes numeric labels)
df.rename(columns=lambda x: x + 1)

# Selective renaming
df.rename(columns={'old_name': 'new_name'})

# Change the index to the values of an existing column
df.set_index('column_one')

# Mass renaming of index: the function is applied to every index label
# (x + 1 assumes numeric labels)
df.rename(index=lambda x: x + 1)

# Filter, sort and groupby

In [None]:
# Rows where the column col is greater than 0.5
df[df[col] > 0.5]

# Rows where 0.5 < col < 0.7
df[(df[col] > 0.5) & (df[col] < 0.7)]

# Sort values by col1 in ascending order
df.sort_values(col1)

# Sort values by col2 in descending order
df.sort_values(col2, ascending=False)

# Sort values by col1 in ascending order, then col2 in descending order
df.sort_values([col1, col2], ascending=[True, False])

# Returns a groupby object for values from one column
df.groupby(col)

# Returns a groupby object for values from multiple columns
df.groupby([col1, col2])

# Returns the mean of the values in col2, grouped by the values in col1
# (mean can be replaced with almost any function from the statistics section;
# without the trailing call this is only a groupby object, not a result)
df.groupby(col1)[col2].mean()

# Create a pivot table that groups by col1 and calculates the mean of col2 and col3
# (the aggregation is passed by name; a bare `mean` is an undefined name)
df.pivot_table(index=col1, values=[col2, col3], aggfunc='mean')

# Find the average across all columns for every unique col1 group
# (the string name avoids the warning recent pandas raises for np.mean here)
df.groupby(col1).agg('mean')

# Apply the function np.max() across each row
df.apply(np.max, axis=1)

# Join and combine data

In [None]:
# Add the rows of df2 to the end of df1 (columns should be identical)
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported equivalent
pd.concat([df1, df2])

# Add the columns of df2 after the columns of df1 (row indexes should be identical)
pd.concat([df1, df2], axis=1)

# SQL-style join: match the values in df1's column col1 against the index of df2
# and combine the columns of both frames for matching rows.
# `how` is typically one of 'left', 'right', 'outer', 'inner'
df1.join(df2, on=col1, how='inner')

# Statistics

In [None]:
# These can all be applied to a Series as well

# Summary statistics for numerical columns
df.describe()

# Returns the mean of all columns
df.mean()

# Returns the correlation between columns in a DataFrame
df.corr()

# Returns the number of non-null values in each DataFrame column
df.count()

# Returns the highest value in each column
df.max()

# Returns the lowest value in each column
df.min()

# Returns the median of each column
df.median()

# Returns the standard deviation of each column
df.std()