# Python For Data Analysis Group 2
GitHub Repository: https://github.com/educated-fool/entity-resolution-group2

## Module 1: RecordLinkage Using Features Weighted Average
Author: Sixuan Li

In [3]:
# Importing the necessary libraries
import pandas as pd

# Load the left and right input tables from their CSV files under ./data
left_df, right_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/left_dataset.csv', './data/right_dataset.csv')
)

In [4]:
from src.module1_recordlinkage import record_linkage_pipeline

In [5]:
# Run the Module 1 (features weighted average) pipeline on the two loaded frames.
# As the last expression in the cell, the call's return value is what gets rendered
# below: a table with left_dataset, right_dataset and confidence_score columns.
# NOTE(review): `filename` is presumably the CSV the matches are written to and
# `threshold=0.85` presumably the minimum confidence kept — confirm both in
# src/module1_recordlinkage.py.
record_linkage_pipeline(left_df, right_df, filename='./data/recordlinkage_submission.csv', threshold=0.85)

Unnamed: 0,Unnamed: 1,left_dataset,right_dataset,confidence_score
7,84020,8,84021,1.00
56,32736,57,32737,1.00
59,39158,60,39159,0.92
59,39236,60,39237,0.92
66,77382,67,77383,1.00
...,...,...,...,...
94474,165,94475,166,1.00
94474,2372,94475,2373,1.00
94538,79362,94539,79363,1.00
94545,83309,94546,83310,0.92


## Module 2: RecordLinkage Using Features Mean
Author: Sixuan Li

In [6]:
import pandas as pd
# Load the left and right input tables from their CSV files under ./data
left_df, right_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/left_dataset.csv', './data/right_dataset.csv')
)

In [4]:
from src.module2_recordlinkage import record_linkage_pipeline_full_address

In [7]:
# Run the Module 2 (features mean) pipeline on the two loaded frames.
# The return value is rendered below as a table with left_dataset, right_dataset
# and confidence_score columns.
# NOTE(review): no threshold is passed here, so any cut-off comes from the function's
# own default; `filename` is presumably the output CSV path — confirm both in
# src/module2_recordlinkage.py.
record_linkage_pipeline_full_address(left_df, right_df, filename='./data/recordlinkage2_submission.csv')

Unnamed: 0,Unnamed: 1,left_dataset,right_dataset,confidence_score
1,75686,2,75687,0.81
7,81911,8,81912,0.83
7,84020,8,84021,0.90
14,51941,15,51942,0.81
19,87105,20,87106,0.81
...,...,...,...,...
94559,74490,94560,74491,0.82
94570,72779,94571,72780,0.81
94570,75425,94571,75426,0.83
94578,80356,94579,80357,0.97


## Module 3: TheFuzz Using Features Mean
Author: Xueni Wang

In [1]:
import pandas as pd
# Load the left and right input tables from their CSV files under ./data
left_df, right_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/left_dataset.csv', './data/right_dataset.csv')
)

In [2]:
from src.module3_thefuzz import thefuzz_pipeline

In [3]:
# Run the Module 3 (TheFuzz, features mean) pipeline on the two loaded frames.
# The return value is rendered below as a table with left_dataset, right_dataset
# and confidence_score columns.
# NOTE(review): `output_csv` is presumably where the matches are saved; unlike the
# other modules the keyword is not `filename`/`filepath` — confirm in
# src/module3_thefuzz.py.
thefuzz_pipeline(left_df, right_df, output_csv = "./data/thefuzz_submission.csv")

Unnamed: 0,left_dataset,right_dataset,confidence_score
0,60,39237,0.81
1,534,42420,0.89
2,1337,39545,0.83
3,2651,49857,0.86
4,3214,49285,0.91
...,...,...,...
14927,58215,88924,0.85
14928,16504,84508,0.84
14929,38004,84508,0.91
14930,72169,91487,0.89


## Module 4: fnmatch + textdistance
Author: Margaret Ma
- **Data Cleaning**: Standardizes postal codes and cleans text fields to ensure uniformity, facilitating accurate comparisons.
- **Blocking Keys Creation**: Forms a block_key for each entry using the initial characters of the address, name, state, and postal code. This key reduces the comparison scope by grouping similar entries together.
- **Matching and Scoring**: Employs fnmatch to find potential name matches and uses textdistance to calculate similarity scores. Matches are confirmed and scored higher if the names are very similar, and a secondary similarity measure is used for others.
- **Output**: Entries with a confidence score above 0.8 are identified as high-confidence matches and saved to a CSV file.

In [1]:
import pandas as pd
# Load the left and right input tables from their CSV files under ./data
left_df, right_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/left_dataset.csv', './data/right_dataset.csv')
)

In [2]:
from src.module4_fnmatch_textdistance import fnmatch_textdistance_pipeline

In [3]:
# Run the Module 4 (fnmatch + textdistance) pipeline on the two loaded frames.
# Per the module description above, entries whose confidence score exceeds the
# threshold (0.8 here) are treated as high-confidence matches and saved to a CSV;
# `filename` is presumably that CSV's path — confirm in
# src/module4_fnmatch_textdistance.py.
# The return value is rendered below (left_dataset, right_dataset, confidence_score).
fnmatch_textdistance_pipeline(left_df, right_df, filename = './data/fnmatch_textdistance_submission.csv', threshold = 0.8)

Unnamed: 0,left_dataset,right_dataset,confidence_score
251,8,81912,1.000
293,8,84021,1.000
950,15,57146,0.885
3561,57,32737,1.000
3707,60,39159,1.000
...,...,...,...
6837734,94475,166,1.000
6837742,94475,2373,1.000
6842155,94539,79363,1.000
6842431,94546,83310,1.000


## Module 5: Dedupe
Author: Xinyi Yu

In [1]:
import pandas as pd
# Load the left and right input tables from their CSV files under ./data
left_df, right_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/left_dataset.csv', './data/right_dataset.csv')
)

In [2]:
from src.module5_dedupe import dedupe_pipeline 

In [3]:
# Run the Module 5 (Dedupe) pipeline on the two loaded frames.
# NOTE(review): the recorded output shows this call printing record pairs and
# prompting "(y)es / (n)o / (u)nsure / (f)inished", i.e. it waited for manual
# labelling in that run. It will therefore likely block under Restart & Run All and
# results may depend on the answers given — confirm in src/module5_dedupe.py.
# `filepath` is presumably where the matches are saved (TODO confirm); the return
# value is rendered below (left_dataset, right_dataset, confidence_score).
dedupe_pipeline(left_df, right_df, filepath = './data/dedupe_submission.csv')

name : dammanns garden company
address : 5129 s emerson ave
city : indianapolis
state : IN
postal_code : 46237

name : dammanns garden company llc
address : 5129 s emerson ave
city : indianapolis
state : IN
postal_code : 46237

0/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished
name : the ultimate bake shoppe of ardmore
address : 120 coulter ave
city : ardmore
state : PA
postal_code : 19003

name : the ultimate bake shoppe of ardmore llc
address : 120 coulter ave suite 3
city : ardmore
state : PA
postal_code : 19003

1/10 positive, 0/10 negative
Do these records refer to the same thing?
(y)es / (n)o / (u)nsure / (f)inished / (p)revious
name : quick lube plus
address : 6014 e hillsborough ave ste c
city : tampa
state : FL
postal_code : 33610

name : quick lube plus tires inc
address : 6014 e hillsborough ave ste c
city : tampa
state : FL
postal_code : 33610

2/10 positive, 0/10 negative
Do these records refer to the same thing?
(y

Unnamed: 0,left_dataset,right_dataset,confidence_score
0,17919,74720,0.975422
1,64766,753,0.975422
2,49662,10025,0.975422
3,77310,73888,0.975422
4,25853,20450,0.975422
...,...,...,...
5251,72760,37980,0.800886
5252,9831,3532,0.800503
5253,50011,58100,0.800399
5254,42587,67573,0.800152
