In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn import preprocessing 
import warnings
warnings.filterwarnings("ignore")
from sklearn.cluster import KMeans

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score
from sklearn.impute import SimpleImputer
from sklearn.metrics import adjusted_rand_score, homogeneity_score, completeness_score, v_measure_score

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns 
from wordcloud import WordCloud, STOPWORDS
from collections import Counter

In [None]:
# Load the flight routes and fares CSV into a DataFrame.
# low_memory=False parses the file as a whole rather than in chunks, which
# avoids mixed-type inference within a column.
DATA_PATH = '/kaggle/input/us-airline-flight-routes-and-fares-1993-2024/US Airline Flight Routes and Fares 1993-2024.csv'
df = pd.read_csv(DATA_PATH, low_memory=False)

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Preview the first five rows.
df.head(5)

In [None]:
# Keep only rows that have no missing value in any column.
# NOTE(review): this also discards rows whose only missing values are in
# columns that are dropped later in the notebook - confirm this is intended.
df = df.dropna(axis=0, how="any")

In [None]:
# Missing-value count per column, to confirm none remain.
df.isna().sum()

In [None]:
# Preview the first five rows of the current frame.
df.head(5)

In [None]:
# Word cloud of the values in the city2 column.
# All rows are joined into a single space-separated text.
text = " ".join(i for i in df.city2)

wordcloud = WordCloud().generate(text)

# Create the figure BEFORE drawing. Calling plt.figure() after plt.imshow()
# opens a second, empty figure, so the requested size never reaches the
# word cloud image.
plt.figure(figsize=[40, 30])
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

* Yukarıdaki wordcloud'da uçuşların varış şehirleri gösterilmektedir. En çok uçuşun yapıldığı şehirler daha belirgin bir şekilde gösterilmiştir.

In [None]:
# Remove the columns that are not used in the rest of the analysis.
columns_to_drop = [
    "tbl",
    "citymarketid_1",
    "citymarketid_2",
    "city1",
    "city2",
    "Geocoded_City1",
    "Geocoded_City2",
    "tbl1apk",
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Replace the string values of the listed columns with integer labels.
label_encoder = preprocessing.LabelEncoder()

cols = ["carrier_lg", "carrier_low", "airport_1", "airport_2"]

for col in cols:
    # The encoder is re-fitted for every column, so the integer codes are
    # independent per column (the same code in two columns need not mean the
    # same original value).
    df[col] = label_encoder.fit_transform(df[col])

# Show how many distinct codes each encoded column has. A bare expression
# inside the loop body is never displayed, so the check lives here as the
# cell's last expression instead.
df[cols].nunique()

* Label Encoding ile kategorik veriler sayısal verilere dönüştürüldü.

In [None]:
# Subset of columns used for the first correlation heatmap.
corr_columns = [
    'Year',
    'airportid_1',
    'airportid_2',
    'fare',
    'carrier_low',
    'carrier_lg',
]
df_corr = df[corr_columns]

In [None]:
# Preview the first five rows of the selected columns.
df_corr.head(5)

In [None]:
# Replace the column subset with its pairwise Pearson correlation matrix.
# Re-running this cell on its own would correlate the matrix with itself.
df_corr = df_corr.corr(method="pearson")

In [None]:
# Annotated heatmap of the correlation matrix held in df_corr.
plt.figure(figsize=(8, 6))
sns.heatmap(df_corr, annot=True, cmap='BuGn')
plt.show()

* Bu tablo, değişkenlerin birbirleri arasındaki ilişkiyi (korelasyonu) temsil etmektedir. Mutlak değeri 1'e daha yakın olan değerler, iki değişken arasındaki ilişkinin daha güçlü olduğunu gösterir.
* Örneğin "fare" ve "Year" değişkenleri arasındaki ilişki diğer değişkenlere göre daha yüksektir.

In [None]:
# Correlation heatmap restricted to the fare and carrier columns.
fare_carrier_columns = ["fare_lg", "fare_low", 'carrier_low', 'carrier_lg']
df_corr = df[fare_carrier_columns].corr()

plt.figure(figsize=(8, 6))
sns.heatmap(df_corr, annot=True, cmap='BuGn')
plt.show()

* Bu ısı haritasına göre "fare_low" ve "fare_lg" arasındaki korelasyonun 1'e yakın olması, bu iki değişken arasındaki ilişkinin güçlü olduğunu gösterir.

In [None]:

# Scatter plot of "fare" against "nsmiles", coloured by the "quarter" column.
pastel_palette = sns.color_palette('pastel')
sns.scatterplot(x="nsmiles", y="fare", hue="quarter", data=df, palette=pastel_palette)
plt.show()

* Bu grafikte, "nsmiles" ve "fare" değişkenleri arasındaki ilişkinin yılın çeyreklerine, yani "quarter" değişkenine göre dağılımını görüyoruz.

In [None]:
# Scatter plot of "fare" against "passengers", coloured by the "quarter" column.
deep_palette = sns.color_palette('deep')
sns.scatterplot(x="passengers", y="fare", hue="quarter", data=df, palette=deep_palette)
plt.show()

* Bu grafik, yılın çeyreklerinde, yolcuların ödediği ücretin yolcu sayısına göre dağılımını gösterir.

In [None]:
# Print a concise summary of df (columns, non-null counts and dtypes).
df.info()

In [None]:
# Target: the "fare" column, kept as a one-column DataFrame.
# Features: every remaining column of df.
# NOTE(review): the features still include "fare_lg" and "fare_low", which
# the heatmap cell above correlates with each other - presumably they track
# "fare" closely as well; confirm they are meant to be predictors.
target_column = "fare"
y = df[[target_column]]
x = df.drop(columns=target_column)

* y bağımlı değişkeni hedef değer olan ücreti, x bağımsız değişkenleri ise bu ücreti etkileyen faktörleri temsil etmektedir.

In [None]:
# Use 70% of the rows for training and the remainder for testing; the fixed
# random_state keeps the split identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.70,
    random_state=22,
)

# **Linear Regression**

In [None]:
# Fit a linear regression on the complete data set (no hold-out).
# fit() returns the estimator itself, so `model` and `lm` are the same object.
lm = LinearRegression()
lm.fit(x, y)
model = lm

In [None]:
# R-squared of the model on the full data set x, y.
model.score(X=x, y=y)

In [None]:
# Fit a fresh linear regression on the training split and report its
# R-squared on the test split.
lm = LinearRegression()
lm.fit(x_train, y_train)
model = lm  # fit() returns the estimator itself
model.score(x_test, y_test)

In [None]:
# Fit the linear model on the training split and report R-squared on the
# test split.
lm.fit(x_train, y_train)
y_test_p = lm.predict(x_test)
test_r2 = r2_score(y_test, y_test_p)

print("Model: Linear Regression")
print(f"Test R-Squared Score: {test_r2:.5f}\n")

# Cross-validate on the training split only.
n_folds = 10
scores = cross_validate(lm, x_train, y_train,
                        scoring=['r2', 'neg_mean_absolute_error',
                                 'neg_mean_squared_error',
                                 'neg_mean_absolute_percentage_error'],
                        cv=n_folds, return_train_score=False)

# One row per fold, indexed 1..n_folds.
scores = pd.DataFrame(scores, index=range(1, n_folds + 1))

# Average the test_* metric columns; the fit/score timing columns are skipped.
mean_scores = scores.filter(like="test_").mean()

# scikit-learn returns the error metrics negated ("neg_*"), so only those are
# turned back into positive errors. Applying abs() to every column would also
# make a negative mean R-squared look positive.
is_negated = mean_scores.index.str.startswith("test_neg_")
mean_scores[is_negated] = mean_scores[is_negated].abs()

print(mean_scores.apply("{:.5f}".format))


* Burada seçilen linear regression modelinin; r squared, MAE, MSE, MAPE değerlerini bulduk.

# **Decision Tree**

In [None]:
# Fit a decision tree regressor on the training split and predict the test
# split. random_state is fixed because the tree permutes features randomly at
# each split; without a seed the fitted tree can differ between runs.
dt = DecisionTreeRegressor(random_state=22)
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

In [None]:
# R-squared of the decision tree on the test split.
dt.score(X=x_test, y=y_test)

In [None]:
# Fit the decision tree on the training split and report R-squared on the
# test split.
dt.fit(x_train, y_train)
y_test_p = dt.predict(x_test)
test_r2 = r2_score(y_test, y_test_p)

print("Model: Decision Tree")
print(f"Test R-Squared Score: {test_r2:.5f}\n")

# Cross-validate on the training split only.
n_folds = 10
scores = cross_validate(dt, x_train, y_train,
                        scoring=['r2', 'neg_mean_absolute_error',
                                 'neg_mean_squared_error',
                                 'neg_mean_absolute_percentage_error'],
                        cv=n_folds, return_train_score=False)

# One row per fold, indexed 1..n_folds.
scores = pd.DataFrame(scores, index=range(1, n_folds + 1))

# Average the test_* metric columns; the fit/score timing columns are skipped.
mean_scores = scores.filter(like="test_").mean()

# scikit-learn returns the error metrics negated ("neg_*"), so only those are
# turned back into positive errors. Applying abs() to every column would also
# make a negative mean R-squared look positive.
is_negated = mean_scores.index.str.startswith("test_neg_")
mean_scores[is_negated] = mean_scores[is_negated].abs()

print(mean_scores.apply("{:.5f}".format))

* Burada seçilen decision tree modelinin; r squared, MAE, MSE, MAPE değerlerini bulduk.

# **K-Nearest Neighbor - KNN**

In [None]:
# Fit a 5-nearest-neighbour regressor on the training split and predict the
# test split. fit() returns the estimator, so the calls can be chained.
# NOTE(review): the features are passed unscaled - confirm that is intended
# for a distance-based model.
knn = KNeighborsRegressor(n_neighbors=5).fit(x_train, y_train)
y_pred = knn.predict(x_test)

In [None]:
# R-squared of the KNN regressor on the test split.
knn.score(X=x_test, y=y_test)

In [None]:
# Fit the KNN regressor on the training split and report R-squared on the
# test split.
knn.fit(x_train, y_train)
y_test_p = knn.predict(x_test)
test_r2 = r2_score(y_test, y_test_p)

print("Model: K-Nearest Neighbor - KNN")
print(f"Test R-Squared Score: {test_r2:.5f}\n")

# Cross-validate on the training split only.
n_folds = 10
scores = cross_validate(knn, x_train, y_train,
                        scoring=['r2', 'neg_mean_absolute_error',
                                 'neg_mean_squared_error',
                                 'neg_mean_absolute_percentage_error'],
                        cv=n_folds, return_train_score=False)

# One row per fold, indexed 1..n_folds.
scores = pd.DataFrame(scores, index=range(1, n_folds + 1))

# Average the test_* metric columns; the fit/score timing columns are skipped.
mean_scores = scores.filter(like="test_").mean()

# scikit-learn returns the error metrics negated ("neg_*"), so only those are
# turned back into positive errors. Applying abs() to every column would also
# make a negative mean R-squared look positive.
is_negated = mean_scores.index.str.startswith("test_neg_")
mean_scores[is_negated] = mean_scores[is_negated].abs()

print(mean_scores.apply("{:.5f}".format))

# **K-Ortalama (k-Means) Kümeleme**

In [None]:
# Cluster the feature matrix into two groups.
kmeans = KMeans(n_clusters=2, random_state=42)
kmeans.fit(x)

# Flatten the target to a 1-D array. np.asarray accepts both the original
# one-column DataFrame and an already-flattened array, so this cell can be
# re-run safely; `y.values` would raise AttributeError on the second run
# because `y` is rebound to an ndarray here.
y = np.asarray(y).ravel()

# Compare the cluster assignments with the target values.
# NOTE(review): `y` holds the "fare" values, which look continuous, while
# these metrics are defined for discrete class labels - confirm that the
# comparison is meaningful.
print("Adjusted Rand Index:", adjusted_rand_score(y, kmeans.labels_))
print("Homojenlik Skoru:", homogeneity_score(y, kmeans.labels_))
print("Completentlik Skoru:", completeness_score(y, kmeans.labels_))
print("V-Measure Skoru:", v_measure_score(y, kmeans.labels_))

# **MODEL SEÇİMİ**

* Gözetimli ve gözetimsiz öğrenme algoritmalarının skorlarına göre en iyi skora sahip model olan linear regression modeli üzerinde hiperparametre optimizasyonu ile devam ediyoruz.

# **HİPERPARAMETRE OPTİMİZASYONU**

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import Ridge
from scipy.stats import uniform

In [None]:
# Tune the Ridge regularisation strength with a 5-fold grid search on the
# training split.
ridge = Ridge()

# 50 candidate alpha values, log-spaced between 1e-4 and 1e4.
param_grid = {
    'alpha': np.logspace(-4, 4, 50)
}

grid_search = GridSearchCV(ridge, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)

grid_search.fit(x_train, y_train)

print("Best parameters:", grid_search.best_params_)

# best_score_ holds the NEGATED mean squared error (the scorer is
# 'neg_mean_squared_error'), so flip the sign to report the MSE itself.
print("Best score (MSE):", -grid_search.best_score_)



In [None]:

# Turn the regression into a binary "at or above the mean fare" problem and
# report classification metrics for it.
# NOTE(review): `model` is the LinearRegression fitted earlier in the
# notebook, not the Ridge grid-search result, and the metrics are computed on
# the training split - confirm both choices are intended.

# Flatten predictions and targets to 1-D so the comparisons and the metric
# functions work on plain vectors instead of (n, 1) columns.
y_pred_continuous = np.asarray(model.predict(x_train)).ravel()
y_train_values = np.asarray(y_train).ravel()

# Scalar threshold: the mean training target. np.mean() applied directly to a
# DataFrame returns a Series or a scalar depending on the pandas version, so
# the mean is taken on the flattened array instead.
threshold = float(y_train_values.mean())

# 1 when the value is at or above the threshold, 0 otherwise.
y_train_class = np.where(y_train_values >= threshold, 1, 0)
y_pred_class = np.where(y_pred_continuous >= threshold, 1, 0)

accuracy = accuracy_score(y_train_class, y_pred_class)
precision = precision_score(y_train_class, y_pred_class)
recall = recall_score(y_train_class, y_pred_class)
f1 = f1_score(y_train_class, y_pred_class)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
