**PROBLEM STATEMENT**

- Suppose you have two dataframe df1 and df2 , both have below columns :-
      
      df1 =>  id, name, mobno
      df2 => id, pincode, address, city

 - After joining both the dataframe on the basis of key i.e **id**, while  selecting **id,name,mobno,pincode, address, city**
 and you are getting an **error ambiguous** column id. How would you resolve it ?

In [0]:
from pyspark.sql.functions import col

# Sample data for df1 and df2
data1 = [(1, "Alice", "123456"), (2, "Bob", "789012")]
data2 = [(1, "12345", "123 Main St", "CityA"), (2, "67890", "456 Elm St", "CityB")]

columns1 = ["id", "name", "mobno"]
columns2 = ["id", "pincode", "address", "city"]

df1 = spark.createDataFrame(data1, columns1)
display(df1)

df2 = spark.createDataFrame(data2, columns2)
display(df2)

id,name,mobno
1,Alice,123456
2,Bob,789012


id,pincode,address,city
1,12345,123 Main St,CityA
2,67890,456 Elm St,CityB


**Ambiguous error**

     join_df = df1.join(df2, df1["id"] == df2["id"], how="inner")
                             (or)
     join_df_amb = df1.join(df2, col("id") == col("id"), how="inner")
                             (or)
     joined_df = df1.join(df2, on="id", how="inner")

In [0]:
join_df = df1.join(df2, df1["id"] == df2["id"], how="inner")
display(join_df)

id,name,mobno,id.1,pincode,address,city
1,Alice,123456,1,12345,123 Main St,CityA
2,Bob,789012,2,67890,456 Elm St,CityB


In [0]:
# Select specific columns with aliases
result_df = join_df.select(col("id"),
                                    col("name"),
                                    col("mobno"),
                                    col("pincode"),
                                    col("address"),
                                    col("city"))
display(result_df)

[0;31m---------------------------------------------------------------------------[0m
[0;31mAnalysisException[0m                         Traceback (most recent call last)
File [0;32m<command-1021705985537567>:2[0m
[1;32m      1[0m [38;5;66;03m# Select specific columns with aliases[39;00m
[0;32m----> 2[0m result_df [38;5;241m=[39m join_df[38;5;241m.[39mselect(col([38;5;124m"[39m[38;5;124mid[39m[38;5;124m"[39m),
[1;32m      3[0m                                     col([38;5;124m"[39m[38;5;124mname[39m[38;5;124m"[39m),
[1;32m      4[0m                                     col([38;5;124m"[39m[38;5;124mmobno[39m[38;5;124m"[39m),
[1;32m      5[0m                                     col([38;5;124m"[39m[38;5;124mpincode[39m[38;5;124m"[39m),
[1;32m      6[0m                                     col([38;5;124m"[39m[38;5;124maddress[39m[38;5;124m"[39m),
[1;32m      7[0m                                     col([38;5;124m"[39m[38;5;124mcity[3

**Method 01**

In [0]:
join_df1 = df1.join(df2, on="id", how="inner")
display(join_df1)

id,name,mobno,pincode,address,city
1,Alice,123456,12345,123 Main St,CityA
2,Bob,789012,67890,456 Elm St,CityB


In [0]:
# Select specific columns with aliases
result_df1 = join_df1.select(col("id"),
                            col("name"),
                            col("mobno"),
                            col("pincode"),
                            col("address"),
                            col("city"))
display(result_df1)

id,name,mobno,pincode,address,city
1,Alice,123456,12345,123 Main St,CityA
2,Bob,789012,67890,456 Elm St,CityB


**Method 02**

In [0]:
# Rename one of the 'id' columns to avoid ambiguity
# For example, renaming 'id' in df1 to 'df1_id'
df3 = df1.withColumnRenamed('id', 'id_df1')
display(df3)

df4=df2
display(df4)

id_df1,name,mobno
1,Alice,123456
2,Bob,789012


id,pincode,address,city
1,12345,123 Main St,CityA
2,67890,456 Elm St,CityB


In [0]:
# Join the DataFrames on 'id'
joined_df1 = df3.join(df4, df3["id_df1"] == df4["id"], how="inner")
display(joined_df1)

id_df1,name,mobno,id,pincode,address,city
1,Alice,123456,1,12345,123 Main St,CityA
2,Bob,789012,2,67890,456 Elm St,CityB


In [0]:
# Select columns from both DataFrames
result_df2 = joined_df1.select('id', 'name', 'mobno', 'pincode', 'address', 'city')

# Show the result
display(result_df2)

id,name,mobno,pincode,address,city
1,Alice,123456,12345,123 Main St,CityA
2,Bob,789012,67890,456 Elm St,CityB
