# Comparison
[Comparison with spreadsheets](https://pandas.pydata.org/docs/getting_started/comparison/comparison_with_spreadsheets.html)
Since many potential pandas users have some familiarity with spreadsheet programs like Excel, this page is meant to provide some examples of how various spreadsheet operations would be performed using pandas. This page will use terminology and link to documentation for Excel, but much will be the same/similar in Google Sheets, LibreOffice Calc, Apple Numbers, and other Excel-compatible spreadsheet software.

In [42]:
import pandas as pd
import numpy as np


# Load the comma-separated tips dataset into a DataFrame.
tips = pd.read_csv('data/tips.csv', sep=',')

# --- Column arithmetic (spreadsheet equivalent: a formula filled down a column) ---
tips['total_bill'] = tips['total_bill'].sub(2)
tips['new_bill'] = tips['total_bill'].div(2)

# --- Row filtering (optional examples, left disabled) ---
# tips = tips[tips['new_bill'] > 10]
# is_dinner = tips['time'] == 'Dinner'
# tips = tips[is_dinner]
# print(tips['new_bill'].count())

# --- Conditional column (spreadsheet equivalent: IF(A1 < 10, "low", "high")) ---
tips['bucket'] = np.where(tips['total_bill'].lt(10), 'low', 'high')
# Boolean mask selecting the 'high' rows; only used by the disabled filter below.
filter_bucket = tips['bucket'].eq('high')
# tips = tips[filter_bucket]

# --- Date and time functions ---
# Assigning a scalar Timestamp fills every row with the same date.
tips['date1'] = pd.Timestamp('2013-01-15')
tips['date2'] = pd.Timestamp('2015-02-15')
# YEAR(K2) in a spreadsheet.
tips['date1_year'] = tips['date1'].dt.year
# MONTH(K2) in a spreadsheet.
tips['date2_month'] = tips['date2'].dt.month
# Roll forward to the first day of the next month,
# i.e. =DATE(YEAR(K2);MONTH(K2)+1;1) in a spreadsheet.
tips['date1_next'] = tips['date1'] + pd.offsets.MonthBegin(1)
# Month difference between the two dates (DATEDIF(K2;L2;"M") in a spreadsheet),
# computed by subtracting the monthly periods of each date.
tips['months_between'] = (
    tips['date2'].dt.to_period('M')
    - tips['date1'].dt.to_period('M')
)

# --- Column selection (optional examples, left disabled) ---
# tips[['total_bill', 'tip']]  # Select multiple columns
# tips.drop("sex", axis=1, inplace=True)  # Drop a column
# tips.rename(columns={'total_bill': 'total_bill2'}, inplace=True)  # Rename a column

# tips = tips.sort_values(['sex'], ascending=True)  # Sort by a column

# --- String functions ---
# print(tips['time'].str.len())  # String length of column
# Substring search, FIND('ale';C2) in a spreadsheet. Note that str.find is
# 0-based and yields -1 when the substring is absent.
tips['FIND()'] = tips['sex'].str.find('ale')
# First character of the string, MID(C2;1;1) in a spreadsheet.
tips['MID()'] = tips['sex'].str.slice(0, 1)

print(tips)




     total_bill   tip     sex smoker   day    time  size  new_bill bucket  \
0         14.99  1.01  Female     No   Sun  Dinner     2     7.495   high   
1          8.34  1.66    Male     No   Sun  Dinner     3     4.170    low   
2         19.01  3.50    Male     No   Sun  Dinner     3     9.505   high   
3         21.68  3.31    Male     No   Sun  Dinner     2    10.840   high   
4         22.59  3.61  Female     No   Sun  Dinner     4    11.295   high   
..          ...   ...     ...    ...   ...     ...   ...       ...    ...   
239       27.03  5.92    Male     No   Sat  Dinner     3    13.515   high   
240       25.18  2.00  Female    Yes   Sat  Dinner     2    12.590   high   
241       20.67  2.00    Male    Yes   Sat  Dinner     2    10.335   high   
242       15.82  1.75    Male     No   Sat  Dinner     2     7.910   high   
243       16.78  3.00  Female     No  Thur  Dinner     2     8.390   high   

         date1      date2  date1_year  date2_month date1_next  \
0   2013-0

# Extracting nth word
In Excel, you might use the [Text to Columns Wizard](https://support.microsoft.com/en-us/office/split-text-into-different-columns-with-the-convert-text-to-columns-wizard-30b14928-5550-41f5-97ca-7a3e9c363ed7) for splitting text and retrieving a specific column. (Note it’s possible to do so through a formula as well.)

The simplest way to extract words in pandas is to split the strings by spaces, then reference the word by index. Note there are more powerful approaches should you need them.

In [43]:
df = pd.DataFrame({'String': ['John Smith', 'Jane Doe', 'Alice Johnson']})

# Split each full name on single spaces once; expand=True spreads the words
# across integer-labelled columns (0 = first word, 1 = second word).
words = df['String'].str.split(" ", expand=True)

# First word in upper case, second word in lower case.
df['first_name'] = words[0].str.upper()
df['last_name'] = words[1].str.lower()

# Title-cased copy of the original string.
df['title'] = df['String'].str.title()

df

Unnamed: 0,String,first_name,last_name,title
0,John Smith,JOHN,smith,John Smith
1,Jane Doe,JANE,doe,Jane Doe
2,Alice Johnson,ALICE,johnson,Alice Johnson


# Merging

In [76]:
# Two small frames sharing a 'key' column. Keys A and C exist only on the
# left, E only on the right, and D appears twice on the right.
df1 = pd.DataFrame({
    'key': ['A', 'B', 'C', 'D'],
    'Value': np.random.randn(4),
})
df2 = pd.DataFrame({
    'key': ['B', 'D', 'D', 'E'],
    'Value': np.random.rand(4),
})

# One merge per join type. The clashing 'Value' columns receive the default
# '_x' (left) and '_y' (right) suffixes.
inner_join = pd.merge(df1, df2, how='inner', on=['key'])
outer_join = pd.merge(df1, df2, how='outer', on=['key'])
left_join = pd.merge(df1, df2, how='left', on=['key'])
right_join = pd.merge(df1, df2, how='right', on=['key'])

# Display the left join: every left row is kept, and unmatched keys get NaN
# in the right-hand value column.
left_join

Unnamed: 0,key,Value_x,Value_y
0,A,-0.74009,
1,B,-1.60361,0.149624
2,C,-0.165419,
3,D,-0.716529,0.200407
4,D,-0.716529,0.735563
