# Project 2: Regression Project
Data Set: [Mercedes-Benz Greener Manufacturing](https://www.kaggle.com/c/mercedes-benz-greener-manufacturing) <br>
*\"Can you cut the time a Mercedes-Benz spends on the test bench?\"*

In [1]:
from google.colab import drive
drive.mount('/content/drive/')

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

sns.set(style="whitegrid")
title_font = {"family":"sans-serif", 
              "color":"gray", 
              "size":16, 
              "weight":"bold"}
axis_font = {"family":"sans", 
              "color":"gray", 
              "size":14, 
              "weight":"normal"}

train = pd.read_csv("/content/drive/My Drive/MertColab/proje2/train.csv")
test = pd.read_csv("/content/drive/My Drive/MertColab/proje2/test.csv")

# Let's convert the data of these columns to numeric values.
# e.g a=1, c=3, z=26, aa=27, ac=29, ba=54, bd=57 etc.
from string import ascii_lowercase

for data in (test, train):
    for col in data.columns[1:10]:
        temp = 1
        for var1 in ascii_lowercase:
            var2 = "a" + var1
            var3 = "b" + var1
            data[col] = data[col].replace(var1, temp)
            data[col] = data[col].replace(var2, temp+26)
            data[col] = data[col].replace(var3, temp+26*2)
            temp+=1
        data[col] = data[col].astype("int64")    # data types are also changed.

# Standardization of these numeric values
from sklearn.preprocessing import scale

for col in train.columns[2:10]:
    train[col] = scale(train[col])
    test[col] = scale(test[col])

# REGRESSION MODELS
import statsmodels.api as sm
import warnings
warnings.filterwarnings("ignore")

X = train.iloc[:, 2:]
Y = train.y
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

x_train = sm.add_constant(x_train)
results_model = sm.OLS(y_train, x_train)
results_ols = results_model.fit()
print("\nAdjusted R-square value of OLS Model: %.4f" %results_ols.rsquared_adj)

from sklearn.linear_model import Ridge, Lasso, ElasticNet, RidgeCV, LassoCV, ElasticNetCV
lasso_CV = LassoCV(alphas=np.logspace(-1, 10, 10), cv=10).fit(x_train, y_train)
lasso_score = lasso_CV.score(x_train, y_train)
print("Adjusted R-square value of Lasso Model: %.4f" %lasso_score)

ridge_CV = RidgeCV(alphas=np.logspace(-4, 100, 10), cv=10).fit(x_train, y_train)
ridge_score = ridge_CV.score(x_train,y_train)
print("Adjusted R-square value of Ridge Model: %.4f" %ridge_score)

elasticNet_CV = ElasticNetCV(alphas=np.logspace(-6, 100, 20), l1_ratio=0.5, cv=10).fit(x_train, y_train)
elasticNet_score = elasticNet_CV.score(x_train, y_train)
print("Adjusted R-square value of ElasticNet Model: %.4f" %elasticNet_score)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


  import pandas.util.testing as tm



Adjusted R-square value of OLS Model: 0.5575
Adjusted R-square value of Lasso Model: 0.5386
Adjusted R-square value of Ridge Model: 0.5915
Adjusted R-square value of ElasticNet Model: 0.5913


***
# Ödev 8.1 - KNN (Proje 2)

In [2]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

In [3]:
%%time
parameters = {"n_neighbors":range(1,11), "weights":("uniform", "distance")}
knn1 = KNeighborsRegressor(n_jobs=-1)
grid_cv = GridSearchCV(estimator=knn1, param_grid=parameters, cv=10)
grid_cv.fit(x_train, y_train)
print("En iyi eğitim parametreleri : ", grid_cv.best_params_)
print("En iyi eğitim skoru         : ", grid_cv.best_score_)

En iyi eğitim parametreleri :  {'n_neighbors': 10, 'weights': 'uniform'}
En iyi eğitim skoru         :  0.44491520463893935
CPU times: user 37.6 s, sys: 1.53 s, total: 39.2 s
Wall time: 2min 42s


In [4]:
grid_cv.fit(x_test, y_test)
print("En iyi test parametreleri : ", grid_cv.best_params_)
print("En iyi test skoru         : ", grid_cv.best_score_)

En iyi test parametreleri :  {'n_neighbors': 10, 'weights': 'distance'}
En iyi test skoru         :  0.4389149887349542


In [5]:
results = pd.DataFrame(grid_cv.cv_results_)
results = results[["param_n_neighbors", "param_weights", "mean_test_score"]]
results = results.sort_values(by="mean_test_score", ascending=False)
results 

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score
19,10,distance,0.438915
18,10,uniform,0.43437
17,9,distance,0.433511
16,9,uniform,0.430677
15,8,distance,0.427246
14,8,uniform,0.426491
12,7,uniform,0.424877
13,7,distance,0.424875
11,6,distance,0.404675
10,6,uniform,0.404357


In [6]:
knn2 = KNeighborsRegressor(n_neighbors=1, n_jobs=-1)
knn2.fit(x_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
                    weights='uniform')

In [7]:
knn3 = KNeighborsRegressor(n_neighbors=1, n_jobs=-1)
knn3.fit(x_test, y_test)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
                    weights='uniform')

In [8]:
print("Train score:", knn2.score(x_train, y_train))
print("Test score:", knn3.score(x_test, y_test))
## seems overfit. why?

Train score: 0.9628481344496057
Test score: 0.9893154261565721


---
***
---

# Project 3: Classification Project
Data Set: [Twitter US Airline Sentiment](https://www.kaggle.com/crowdflower/twitter-airline-sentiment) <br>
*\"Analyze how travelers in February 2015 expressed their feelings on Twitter\"*

In [9]:
%matplotlib inline
sns.set(style="whitegrid")

# calling our honor guest:
tweets = pd.read_csv("/content/drive/My Drive/MertColab/proje3/tweets.csv")

# ...and she takes off her fancy fur coat and some other unnecessary ornaments 
tweets.drop(columns=["tweet_id", "airline_sentiment_gold", "name", 
                     "negativereason_gold", "retweet_count", "tweet_created", 
                     "tweet_coord", "tweet_created", "tweet_location", 
                     "user_timezone"], 
            inplace=True)

# We are going to clean the tweets to keep only the English words
import re, string, nltk
nltk.download('words')
words = set(nltk.corpus.words.words())

# The twitter accounts mentioned:
set_them = set()
for i in tweets.text:
    search_them = re.search(r"(^|[^@\w])@(\w{1,15})\b", i)
    give_them = search_them.group().lower()
    set_them.add(give_them)
del set_them

# These are the Twitter accounts we will remove:
# @virginamerica, @united, @southwestair, @deltaassist, @usairways, @americanair

def tweet_cleaner(tweet):
    "Symbols, numbers & airline brand cleaner function!"
    # First step: Symbols and numbers are gone:
    cleaned_string = re.sub("[^a-zA-Z]", " ", tweet)

    # Party people here speak only English. No other languages are allowed:
    cleaned_string = " ".join(w for w in nltk.wordpunct_tokenize(cleaned_string) \
         if w.lower() in words)

    # Second step: Putting them to a list:
    cleaned_set = set(cleaned_string.lower().split())
    # Let's get rid of our meaningless one-letter friends: 
    what_to_clean = list(string.ascii_lowercase)
    # ...and some of their drunk buddies:
    what_to_clean.extend(["it", "of", "co", "to", "http"])
    # We don't need any officials in our party as well:
    what_to_clean.extend(["virginamerica", "united", 
                       "southwestair", "deltaassist",
                       "usairways", "americanair"])
    for delete in what_to_clean:
        try:
            cleaned_set.remove(delete)
        except:
            continue    
    return list(cleaned_set)

# We need to convert the "airline_sentiment" to numbers.
for i,j in enumerate(tweets.airline_sentiment.unique()):
    tweets.airline_sentiment = tweets.airline_sentiment.replace(j, i)
tweets.airline_sentiment.astype("int64")

# We are going to collect all the words and give them binary inputs (1 or 0)
# according to their entities in the tweets.
all_words = set()
rows=[]
for i in tweets.index:
    listed_tweet = tweet_cleaner(tweets.text[i])
    rows.append(listed_tweet)
    all_words.update(listed_tweet)

# filling the data frame with binary values according to the words
df_words = pd.DataFrame(0, columns=all_words, index=tweets.index)
for i in tweets.index:
    df_words.loc[i, rows[i]]=1

# it's time to concatenate the data frames
tweets = pd.concat([tweets, df_words], ignore_index=False, axis="columns")

# Linear Regression Models
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_validate, cross_val_score
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV

X = tweets.iloc[:, 6:]
Y = tweets.airline_sentiment
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

lr_model1 = LogisticRegression(C=0.1, solver="lbfgs", max_iter=300, n_jobs=-1)
lr_model1.fit(x_train, y_train)
train_score_model1 = lr_model1.score(x_train, y_train)
test_score_model1 = lr_model1.score(x_test, y_test)

print("\nTrain score with 'lbfgs' solver: {:.3f}".format(train_score_model1))
print("Test score with 'lbfgs' solver: {:.3f}".format(test_score_model1))

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!

Train score with 'lbfgs' solver: 0.824
Test score with 'lbfgs' solver: 0.789


***
# Ödev 8.1 - KNN (Proje 3)

In [16]:
from sklearn.neighbors import KNeighborsClassifier

In [19]:
%%time
parameters = {"n_neighbors":range(1,11,3), "weights":("uniform", "distance")}
knn1 = KNeighborsClassifier(n_jobs=-1)
grid_cv = GridSearchCV(estimator=knn1, param_grid=parameters, cv=5)
grid_cv.fit(x_train, y_train)
print("En iyi eğitim parametreleri : ", grid_cv.best_params_)
print("En iyi eğitim skoru         : ", grid_cv.best_score_)

En iyi eğitim parametreleri :  {'n_neighbors': 1, 'weights': 'uniform'}
En iyi eğitim skoru         :  0.4877912403645796
CPU times: user 7min 41s, sys: 13.4 s, total: 7min 54s
Wall time: 2h 7min 47s


In [20]:
%%time
grid_cv.fit(x_test, y_test)
print("En iyi test parametreleri : ", grid_cv.best_params_)
print("En iyi test skoru         : ", grid_cv.best_score_)

En iyi test parametreleri :  {'n_neighbors': 1, 'weights': 'uniform'}
En iyi test skoru         :  0.41732213179312155
CPU times: user 1min 24s, sys: 2.63 s, total: 1min 26s
Wall time: 9min 7s


In [21]:
results = pd.DataFrame(grid_cv.cv_results_)
results = results[["param_n_neighbors", "param_weights", "mean_test_score"]]
results = results.sort_values(by="mean_test_score", ascending=False)
results 

Unnamed: 0,param_n_neighbors,param_weights,mean_test_score
0,1,uniform,0.417322
1,1,distance,0.417322
3,4,distance,0.386585
5,7,distance,0.364391
7,10,distance,0.356193
4,7,uniform,0.34049
2,4,uniform,0.340148
6,10,uniform,0.326485


In [22]:
knn2 = KNeighborsRegressor(n_neighbors=1, n_jobs=-1)
knn2.fit(x_train, y_train)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
                    weights='uniform')

In [23]:
knn3 = KNeighborsRegressor(n_neighbors=1, n_jobs=-1)
knn3.fit(x_test, y_test)

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=-1, n_neighbors=1, p=2,
                    weights='uniform')

In [24]:
%%time
print("Train score:", knn2.score(x_train, y_train))
print("Test score:", knn3.score(x_test, y_test))
## seems overfit. why?

Train score: 0.9806746092278897
Test score: 0.9877906018600187
CPU times: user 12min 30s, sys: 395 ms, total: 12min 30s
Wall time: 8min 36s
