In [1]:
import jsonschema
import json
import jsondiff
import pandas as pd

from jsonschema import validate
from jsondiff import diff
from difflib import SequenceMatcher


In [2]:
# JSON Schema for TexasMusicVenue_SanAntonio.json: an array of objects in which
# all four fields are required. Address must be a string; Year may be null.
schema1 = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "Address": {"type": "string"},
            "Venue": {"type": ["string", "null", "integer"]},
            "Artist": {"type": ["string", "integer", "null"]},
            "Year": {"type": ["integer", "null"]},
        },
        "required": ["Year", "Address", "Venue", "Artist"],
    },
}

# JSON Schema for og_data.json: same four required fields, but Address must be
# null and Year must be an integer.
schema2 = {
    "type": "array",
    "items": {
        "type": "object",
        "properties": {
            "Address": {"type": "null"},
            "Venue": {"type": ["string", "null", "integer"]},
            "Artist": {"type": ["string", "integer", "null"]},
            "Year": {"type": ["integer"]},
        },
        "required": ["Year", "Address", "Venue", "Artist"],
    },
}



# Load and parse TexasMusicVenue_SanAntonio.json.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open('TexasMusicVenue_SanAntonio.json', encoding='utf-8') as json_file:
    json_data1 = json.load(json_file)

# Load and parse og_data.json
with open('og_data.json', encoding='utf-8') as json_file:
    json_data2 = json.load(json_file)

# Validate TexasMusicVenue_SanAntonio.json against schema1.
# validate() raises ValidationError on the first violation it reports; the
# error is printed rather than re-raised so the second file is still checked.
try:
    validate(json_data1, schema1)
    print("File 1 is valid.")
except jsonschema.ValidationError as e:
    print("File 1 is invalid. Error: ", e)

# Validate og_data.json against schema2
try:
    validate(json_data2, schema2)
    print("File 2 is valid.")
except jsonschema.ValidationError as e:
    print("File 2 is invalid. Error: ", e)


File 1 is valid.
File 2 is valid.


In [3]:
def compare_json_files(file1, file2):
    """Return True when two JSON files parse to equal Python objects.

    The comparison is made on the parsed data, so differences in whitespace
    or in the order of object keys do not count; the order of array elements
    does.

    Args:
        file1: Path to the first JSON file.
        file2: Path to the second JSON file.

    Returns:
        bool: Result of ``==`` between the two parsed documents.

    Raises:
        OSError: If either file cannot be opened.
        json.JSONDecodeError: If either file is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # encoding, which differs between systems.
    with open(file1, 'r', encoding='utf-8') as f1, \
            open(file2, 'r', encoding='utf-8') as f2:
        json_data1 = json.load(f1)
        json_data2 = json.load(f2)

    return json_data1 == json_data2

# Example usage: check whether the two venue datasets parse to equal data
file1 = 'TexasMusicVenue_SanAntonio.json'
file2 = 'og_data.json'

are_equal = compare_json_files(file1, file2)
print("The JSON files are identical." if are_equal else "The JSON files are different.")


The JSON files are different.


In [4]:
def _collect_key_and_address(json_data, key):
    """Return parallel lists of ``key`` values and 'Address' values.

    ``json_data`` may be a list of records or a single record (dict). Records
    that are not dicts or that lack ``key`` are skipped; a missing 'Address'
    is recorded as None.
    """
    if isinstance(json_data, dict):
        records = [json_data]
    elif isinstance(json_data, list):
        records = json_data
    else:
        records = []

    values, addresses = [], []
    for record in records:
        # Guard against non-dict elements: for a string element, ``key in``
        # would be a substring test and the subsequent lookup would fail.
        if isinstance(record, dict) and key in record:
            values.append(record[key])
            addresses.append(record.get('Address'))
    return values, addresses


# NOTE: a later cell in this notebook defines another function with this same
# name; once that cell has run, this definition is shadowed.
def compare_similarity(file1_path, file2_path, key, threshold=0.8):
    """Pair up records from two JSON files whose ``key`` values look alike.

    Every value of ``key`` in the first file is compared with every value of
    ``key`` in the second file. Values are converted with ``str()`` and scored
    with ``difflib.SequenceMatcher(...).ratio()`` (a 0..1 similarity ratio,
    not a Levenshtein distance). Pairs scoring at least ``threshold`` are kept.

    Args:
        file1_path: Path to the first JSON file (list of objects or one object).
        file2_path: Path to the second JSON file (same accepted shapes).
        key: Field name whose values are compared.
        threshold: Minimum similarity ratio for a pair to be reported.

    Returns:
        pandas.DataFrame with columns 'Value1', 'Address1', 'Value2',
        'Address2', 'Similarity'; one row per matching pair, ordered by the
        first file's records and then the second file's records.
    """
    # Read the JSON files (explicit UTF-8 rather than the platform default)
    with open(file1_path, 'r', encoding='utf-8') as file1:
        json_data1 = json.load(file1)
    with open(file2_path, 'r', encoding='utf-8') as file2:
        json_data2 = json.load(file2)

    values1, addresses1 = _collect_key_and_address(json_data1, key)
    values2, addresses2 = _collect_key_and_address(json_data2, key)

    # Stringify the second file's values once instead of once per outer row.
    texts2 = [str(value2) for value2 in values2]

    # Identical (text1, text2) pairs are scored only once; the argument order
    # is kept because SequenceMatcher's ratio is not guaranteed symmetric.
    ratio_cache = {}
    matches = []
    for value1, address1 in zip(values1, addresses1):
        text1 = str(value1)
        for value2, address2, text2 in zip(values2, addresses2, texts2):
            pair = (text1, text2)
            similarity_ratio = ratio_cache.get(pair)
            if similarity_ratio is None:
                similarity_ratio = SequenceMatcher(None, text1, text2).ratio()
                ratio_cache[pair] = similarity_ratio
            if similarity_ratio >= threshold:
                matches.append({
                    'Value1': value1,
                    'Address1': address1,
                    'Value2': value2,
                    'Address2': address2,
                    'Similarity': similarity_ratio,
                })

    # Explicit columns keep the frame's layout stable even with no matches.
    return pd.DataFrame(
        matches,
        columns=['Value1', 'Address1', 'Value2', 'Address2', 'Similarity'],
    )

# Example usage: fuzzy-match the 'Venue' values of the two datasets
file1_path = 'TexasMusicVenue_SanAntonio.json'
file2_path = 'og_data.json'
consistent_key = 'Venue'

matches_df = compare_similarity(
    file1_path,
    file2_path,
    key=consistent_key,
    threshold=0.8,
)
print(matches_df)

                   Value1                                   Address1  \
0             AT&T Center       1 AT&T Center, San Antonio, TX 78219   
1             AT&T Center       1 AT&T Center, San Antonio, TX 78219   
2             AT&T Center       1 AT&T Center, San Antonio, TX 78219   
3             AT&T Center       1 AT&T Center, San Antonio, TX 78219   
4             AT&T Center       1 AT&T Center, San Antonio, TX 78219   
...                   ...                                        ...   
1157  Wetmore City Limits  12329 Wetmore Road, San Antonio, TX 78247   
1158  Wetmore City Limits  12329 Wetmore Road, San Antonio, TX 78247   
1159  Wetmore City Limits  12329 Wetmore Road, San Antonio, TX 78247   
1160  Wetmore City Limits  12329 Wetmore Road, San Antonio, TX 78247   
1161  Wetmore City Limits  12329 Wetmore Road, San Antonio, TX 78247   

                   Value2 Address2  Similarity  
0             AT&T Center     None         1.0  
1             AT&T Center     None   

In [5]:
def _collect_venue_records(json_data, key):
    """Return (key value, Address, Artist, Year) tuples for records holding ``key``.

    ``json_data`` may be a list of records or a single record (dict). Records
    that are not dicts or that lack ``key`` are skipped; a missing 'Address',
    'Artist' or 'Year' is recorded as None.
    """
    if isinstance(json_data, dict):
        records = [json_data]
    elif isinstance(json_data, list):
        records = json_data
    else:
        records = []

    return [
        (record[key], record.get('Address'), record.get('Artist'), record.get('Year'))
        for record in records
        if isinstance(record, dict) and key in record
    ]


def compare_similarity(file1_path, file2_path, key, threshold=0.8):
    """Join two JSON datasets on approximately equal ``key`` values.

    Every ``key`` value of the first file is compared with every ``key`` value
    of the second file. Values are converted with ``str()`` and scored with
    ``difflib.SequenceMatcher(...).ratio()`` (a 0..1 similarity ratio, not a
    Levenshtein distance). For each pair scoring at least ``threshold`` the
    result takes the year and artist from the second file and the address
    from the first file.

    Args:
        file1_path: Path to the first JSON file (list of objects or one object).
        file2_path: Path to the second JSON file (same accepted shapes).
        key: Field name whose values are compared.
        threshold: Minimum similarity ratio for a pair to be reported.

    Returns:
        pandas.DataFrame with columns 'Year2', 'Venue', 'Artist2', 'Address1'.
        'Venue' is the first file's value, falling back to the second file's
        value where the first is null.
    """
    # Read the JSON files (explicit UTF-8 rather than the platform default)
    with open(file1_path, 'r', encoding='utf-8') as file1:
        json_data1 = json.load(file1)
    with open(file2_path, 'r', encoding='utf-8') as file2:
        json_data2 = json.load(file2)

    records1 = _collect_venue_records(json_data1, key)
    records2 = _collect_venue_records(json_data2, key)

    # Stringify the second file's key values once instead of once per outer row.
    texts2 = [str(record[0]) for record in records2]

    # Identical (text1, text2) pairs are scored only once; the argument order
    # is kept because SequenceMatcher's ratio is not guaranteed symmetric.
    ratio_cache = {}
    matches = []
    for v1, address1, _artist1, _year1 in records1:
        text1 = str(v1)
        for (v2, _address2, artist2, year2), text2 in zip(records2, texts2):
            pair = (text1, text2)
            similarity_ratio = ratio_cache.get(pair)
            if similarity_ratio is None:
                similarity_ratio = SequenceMatcher(None, text1, text2).ratio()
                ratio_cache[pair] = similarity_ratio
            if similarity_ratio >= threshold:
                # Only the fields that reach the result are collected.
                matches.append((year2, v1, v2, artist2, address1))

    df = pd.DataFrame(
        matches,
        columns=['Year2', 'Venue1', 'Venue2', 'Artist2', 'Address1'],
    )

    # Merge Venue1 and Venue2 into a single Venue column
    df['Venue'] = df['Venue1'].combine_first(df['Venue2'])

    # Selecting the final columns also discards the Venue1/Venue2 helpers.
    return df[['Year2', 'Venue', 'Artist2', 'Address1']]

# Example usage: rebuild matches_df with the compare_similarity defined in this cell
file1_path = 'TexasMusicVenue_SanAntonio.json'
file2_path = 'og_data.json'
consistent_key = 'Venue'

matches_df = compare_similarity(
    file1_path,
    file2_path,
    key=consistent_key,
    threshold=0.8,
)
print(matches_df)


      Year2                Venue  \
0      2006          AT&T Center   
1      2006          AT&T Center   
2      2006          AT&T Center   
3      2006          AT&T Center   
4      2006          AT&T Center   
...     ...                  ...   
1157   1997  Wetmore City Limits   
1158   1997  Wetmore City Limits   
1159   1997  Wetmore City Limits   
1160   1999  Wetmore City Limits   
1161   1999  Wetmore City Limits   

                                               Artist2  \
0     San Antonio Stock Show and Rodeo: Dierks Bentley   
1        San Antonio Stock Show and Rodeo: Keith Urban   
2       San Antonio Stock Show and Rodeo: George Jones   
3        San Antonio Stock Show and Rodeo: Hilary Duff   
4           San Antonio Stock Show and Rodeo: Kid Rock   
...                                                ...   
1157                                      Michael Waid   
1158                  Ronny Cloud & Silver Lining Band   
1159                                      Dav

In [6]:
# Number of matched rows per venue (value_counts sorts by count, descending)
duplicate_counts = matches_df.loc[:, 'Venue'].value_counts()
print(duplicate_counts)


Sunken Garden Theater                   217
Freeman Coliseum                        161
AT&T Center                             135
Tycoon Flats                             93
Sunset Station                           75
Alamodome                                73
Arneson River Theatre                    69
Sam's Burger Joint                       52
Lila Cockrell Theatre                    48
Tobin Center for the Performing Arts     41
The Cove                                 33
Bonham Exchange                          32
LUNA                                     16
Specht's Store                           13
Durty Nelly's                            13
Martinez Social Club                     12
Wetmore City Limits                      11
Six Flags Fiesta Texas                   10
Carmen's de la Calle Cafe                 7
San Antonio Botanical Garden              6
Guadalupe Cultural Arts Center            4
Josephine Theatre                         4
VFW Post 8541                   

In [7]:
# Look up the matched-row count for a single venue by its label
venue_name = 'AT&T Center'
count = duplicate_counts.loc[venue_name]
print(f"Count of {venue_name}: {count}")


Count of AT&T Center: 135


In [8]:
# Distinct artists for every (year, venue) combination among the matches
year_venue_info = (
    matches_df
    .groupby(['Year2', 'Venue'])
    .agg({'Artist2': 'unique'})
)
print(year_venue_info)

                                                                                      Artist2
Year2 Venue                                                                                  
1970  Freeman Coliseum                      [James Brown, San Antonio Stock Show & Rodeo: ...
      Sunken Garden Theater                                        [Country Joe and the Fish]
      Tobin Center for the Performing Arts                 [Duke Ellington and his Orchestra]
1971  Freeman Coliseum                      [San Antonio Stock Show and Rodeo: Roy Clark, ...
      Sunken Garden Theater                 [San Antonio Soul Festival: Pete Seeger, Rev. ...
...                                                                                       ...
2010  Sam's Burger Joint                    [Kinky Friedman, Mingo Fishtrap, Charlie and t...
      Six Flags Fiesta Texas                                                  [Jordin Sparks]
      Sunken Garden Theater                              [Te

In [9]:
# Write the matches to matches.json as an indented JSON array, one object per row
matches_df.to_json(path_or_buf='matches.json', orient='records', indent=4)


Specific Year and Venue

In [10]:
start_year = 2006
end_year = 2022
venue_name = 'AT&T Center'

# Keep the rows for the chosen venue whose year lies in [start_year, end_year]
# (Series.between includes both endpoints by default).
filtered_df = matches_df[
    matches_df['Year2'].between(start_year, end_year)
    & matches_df['Venue'].eq(venue_name)
]
print(filtered_df)

     Year2        Venue                                            Artist2  \
0     2006  AT&T Center   San Antonio Stock Show and Rodeo: Dierks Bentley   
1     2006  AT&T Center      San Antonio Stock Show and Rodeo: Keith Urban   
2     2006  AT&T Center     San Antonio Stock Show and Rodeo: George Jones   
3     2006  AT&T Center      San Antonio Stock Show and Rodeo: Hilary Duff   
4     2006  AT&T Center         San Antonio Stock Show and Rodeo: Kid Rock   
..     ...          ...                                                ...   
130   2010  AT&T Center                                               Rush   
131   2010  AT&T Center                                            Shakira   
132   2010  AT&T Center  Carrie Underwood, Billy Currington, Sons of Sy...   
133   2010  AT&T Center                                      Justin Bieber   
134   2010  AT&T Center                           Trans-Siberian Orchestra   

                                 Address1  
0    1 AT&T Center,

In [11]:
# Pull the two columns out as plain Python lists, then print one line per row
years, artists = (filtered_df[column].tolist() for column in ('Year2', 'Artist2'))

print(f"Years and Artists for {venue_name}:")
for year, artist in zip(years, artists):
    print(f"{year}: {artist}")


Years and Artists for AT&T Center:
2006: San Antonio Stock Show and Rodeo: Dierks Bentley
2006: San Antonio Stock Show and Rodeo: Keith Urban
2006: San Antonio Stock Show and Rodeo: George Jones
2006: San Antonio Stock Show and Rodeo: Hilary Duff
2006: San Antonio Stock Show and Rodeo: Kid Rock
2006: San Antonio Stock Show and Rodeo: Bill Engvall
2006: San Antonio Stock Show and Rodeo: Cross Canadian Ragweed
2006: San Antonio Stock Show and Rodeo: Big & Rich
2006: San Antonio Stock Show and Rodeo: Montgomery Gentry
2006: San Antonio Stock Show and Rodeo: Bellamy Brothers/Kumbia Kings
2006: San Antonio Stock Show and Rodeo: Gary Allan
2006: San Antonio Stock Show and Rodeo: Clay Walker
2006: San Antonio Stock Show and Rodeo: Rascal Flatts  
2006: San Antonio Stock Show and Rodeo: Willie Nelson
2006: San Antonio Stock Show and Rodeo: Charlie Daniels Band/Lynryd Skynyrd
2006: San Antonio Stock Show and Rodeo: Wayne Newton
2006: George Strait, Miranda Lambert, Tracy Lawrence
2006: Tim McGr