In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, roc_curve

# <font color = 'red'> Task 4. Exploratory Data Analysis (EDA). Introduction to classification. </font>

## <font color = 'green'>1. Exploratory data analysis </font>

### <font color = 'green'> Example 1: Data loading and initial analysis. Using the example of telecom.csv </font>

In [None]:
# Load the telecom dataset from a CSV file located in the working directory.
data = pd.read_csv('telecom.csv')

In [None]:
# Preview the first rows of the freshly loaded frame.
data.head()

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
data.info()

In [None]:
# Store Churn as int64; later cells compare this column against 0 and 1.
data['Churn']=data['Churn'].astype('int64')

In [None]:
# Re-inspect the first rows after the Churn cast.
data.head()

In [None]:
# Descriptive statistics with pandas' default column selection.
data.describe()

In [None]:
# Descriptive statistics restricted to object and bool columns.
data.describe(include=['object', 'bool'])

In [None]:
# Absolute number of rows for each Churn value.
data['Churn'].value_counts()

In [None]:
# The same counts expressed as fractions of all rows.
data['Churn'].value_counts(normalize = True)

In [None]:
# Per-column means for each Churn group.  Churn == 1 marks the customers who
# left ("disertirs") and Churn == 0 the ones who stayed ("abons").  The two
# filters used to be swapped, so every column of the comparison table was
# labelled with the opposite group.
abons_stat = data[data['Churn'] == 0].mean(numeric_only=True).rename('abons')
disertirs_stat = data[data['Churn'] == 1].mean(numeric_only=True).rename('disertirs')
# How many times larger each mean is for churned customers than for retained
# ones (the Churn row itself is 1 / 0 and therefore comes out as inf).
ratio = (disertirs_stat / abons_stat).rename('ratio')

In [None]:
# Side-by-side table of both group means and their ratio, ordered by the ratio.
pd.concat([abons_stat,disertirs_stat,ratio],axis = 1).sort_values(by='ratio')

In [None]:
# Contingency table: row counts for every (Churn, International plan) pair.
pd.crosstab(data['Churn'], data['International plan']) # for categorical features

In [None]:
# Contingency table of Churn vs. Voice mail plan; normalize=True turns the
# counts into fractions of the grand total.
pd.crosstab(data['Churn'], data['Voice mail plan'],normalize=True)

In [None]:
# Same Churn / International plan table with row and column totals (margins=True).
pd.crosstab(data['Churn'], data['International plan'],margins=True)

In [None]:
# Churn counts for each number of customer service calls, with totals.
pd.crosstab(data['Churn'], data['Customer service calls'],margins=True)

In [None]:
# Bar chart of rows per number of service calls, split by Churn.
sns.countplot(x='Customer service calls', hue='Churn', data=data);

In [None]:
# Binary flag: 1 when the row has more than 3 customer service calls, else 0.
data['Intensive_service_calls_Q'] = (data['Customer service calls'] > 3).astype('int')

# Contingency table of the new flag against Churn, with totals.
pd.crosstab(data['Intensive_service_calls_Q'], data['Churn'], margins=True)

In [None]:
# Bar chart of the intensive-calls flag, split by Churn.
sns.countplot(x='Intensive_service_calls_Q', hue='Churn', data=data);

In [None]:
# Encode the 'No' / 'Yes' values of International plan as 0 / 1.
# NOTE: Series.map turns every value that is not a key of `d` into NaN, so
# running this cell a second time (when the column already holds 0/1)
# replaces the whole column with NaN.
d = {'No' : 0, 'Yes' : 1}
data['International plan'] = data['International plan'].map(d)

In [None]:
# Check the result of the International plan encoding.
data.head()

In [None]:
# Apply the same 'No' / 'Yes' -> 0 / 1 mapping `d` (defined in an earlier cell)
# to Voice mail plan.  As with any Series.map call, re-running it on the
# already encoded column yields NaN.
data['Voice mail plan'] = data['Voice mail plan'].map(d)

In [None]:
# Check the result of the Voice mail plan encoding.
data.head()

In [None]:
# Element-wise AND of the intensive-calls flag and the International plan
# column, cross-tabulated against Churn.
# NOTE(review): presumably relies on International plan having been encoded
# as 0/1 by the earlier mapping cell -- confirm the cells are run in order.
pd.crosstab(data['Intensive_service_calls_Q'] & data['International plan'] , data['Churn'])

### <font color = 'green'> Example 2: Correlation. Using the example of telecom.csv </font>

In [None]:
# Pairwise correlations of the quantitative features only.  The listed
# categorical / code-like columns are dropped explicitly, and
# numeric_only=True additionally skips any other non-numeric column instead
# of raising (DataFrame.corr no longer discards such columns on its own in
# pandas >= 2.0).
corr_matrix = data.drop(['State', 'International plan', 'Voice mail plan',
                         'Area code'], axis=1).corr(numeric_only=True)

In [None]:
# Display the correlation matrix as a table.
corr_matrix

In [None]:
# Colour-coded view of the same correlation matrix.
sns.heatmap(corr_matrix,cmap="crest");

### <font color = 'green'> Example 3. Graphical data analysis. Using telecom.csv as an example </font>

In [None]:
# Columns left out of the histograms.
excluded = {'State', 'International plan', 'Voice mail plan', 'Area code',
            'Total day charge', 'Total eve charge', 'Total night charge',
            'Total intl charge', 'Churn', 'Intensive_service_calls_Q', 'Phone number'}
# Filter in the frame's own column order instead of taking a set difference:
# the iteration order of a set of strings changes between interpreter runs,
# which reshuffled the histogram grid after every kernel restart.
features = [col for col in data.columns if col not in excluded]
data[features].hist(figsize=(20,12));

In [None]:
# Pair plot of a few selected columns, coloured by Churn.
pairplot_columns = [
    'Customer service calls',
    'Total day minutes',
    'Total day charge',
    'Total night calls',
    'Churn',
]
sns.pairplot(data[pairplot_columns], hue='Churn');

### <font color = 'green'> Example 4. Graphical data analysis. Using the example of Fisher's Irises</font>

In [None]:
# Load the 'iris' example dataset through seaborn.
iris = sns.load_dataset('iris')


In [None]:
# Preview the first rows of the iris frame.
iris.head()

In [None]:
# Pair plot of all iris columns, coloured by species.
sns.pairplot(iris, hue='species', height=1.5)
plt.show()

In [None]:
# Every column except the class label, kept in the frame's own column order.
# A set difference yields an order that changes between interpreter runs,
# which moved each feature to a different subplot after a kernel restart.
features = [col for col in iris.columns if col != 'species']

In [None]:
# Show which columns were selected as features.
features

In [None]:
# One box plot per feature on a 2x2 grid, each grouped by species.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(7, 4))
flat_axes = axes.ravel()  # row-major, so position k is axes[k // 2, k % 2]

for position, column in enumerate(features):
    panel = flat_axes[position]
    sns.boxplot(data=iris, x='species', y=column, hue='species', ax=panel)
    panel.set_xlabel('species')
    panel.set_ylabel(column)
plt.show()

In [None]:
# Stand-alone box plot of sepal_length for each species.
sns.boxplot(data=iris, x='species', y='sepal_length', hue='species')
plt.show()

In [None]:
# Histogram of every selected feature column of the iris frame.
iris[features].hist(figsize=(7,4));
plt.show();

 https://habr.com/ru/articles/248623/

In [None]:
def andrews_curve(x, theta):
  """Evaluate the Andrews curve of one sample at the given angles.

  The curve is the finite Fourier series

      f(t) = x[0]/sqrt(2) + x[1]*sin(t) + x[2]*cos(t)
             + x[3]*sin(2t) + x[4]*cos(2t) + ...

  Any number of features (at least one) is supported; for four features the
  result is the same sum the previous hard-coded version produced.

  Parameters:
    x: 1-D sequence of numeric feature values of a single sample.
    theta: 1-D sequence of angles (radians) at which to evaluate the curve.

  Returns:
    A list with one curve value per entry of ``theta``.

  Raises:
    IndexError: if ``x`` is empty.
  """
  x = np.asarray(x, dtype=float)
  theta = np.asarray(theta, dtype=float)

  # Constant term, broadcast to every angle.
  curve = np.full(theta.shape, x[0] / np.sqrt(2))
  for k in range(1, x.size):
    harmonic = (k + 1) // 2            # 1, 1, 2, 2, 3, 3, ...
    wave = np.sin if k % 2 else np.cos  # odd index -> sin, even index -> cos
    curve = curve + x[k] * wave(harmonic * theta)
  return list(curve)

accuracy = 1000  # number of theta values each curve is evaluated at
samples = iris.drop(columns=['species']).to_numpy()
theta = np.linspace(-np.pi, np.pi, accuracy)

# (row range, line colour) for the three groups of rows that get plotted.
species_rows = (
  (slice(0, 20), 'r'),     # setosa
  (slice(50, 70), 'g'),    # versicolor
  (slice(100, 120), 'b'),  # virginica
)
for rows, colour in species_rows:
  for s in samples[rows]:
    plt.plot(theta, andrews_curve(s, theta), colour)

plt.xlim(-np.pi, np.pi)
plt.show()


<font color = 'red' size = 5>Task 1 </font>

1. Study the examples above of targeted work with individual features in the source data.

2. After working with the documentation and examples, select and prepare features for the famous [Titanic dataset](https://www.kaggle.com/c/titanic/data), as well as for the [telecom operators](https://www.kaggle.com/datasets/mnassrib/telecom-churn-datasets) data.
+ 2.1 Visualize the features. Determine their types.
+ 2.2 Remove anomalous data where necessary (using the simplest z-score and IQR methods).
+ 2.3 Identify important features using descriptive statistics and correlation.
+ 2.4 Fill in the missing values and otherwise process the data at your own discretion.
+ 2.5 Save the prepared dataset to a new csv file.
   
        



[https://scikit-learn.org/stable/modules/preprocessing.html](https://scikit-learn.org/stable/modules/preprocessing.html)


### <font color = 'green'> Example 5. Working with text </font>

####  bag of words

In [None]:
from functools import reduce
import numpy as np

texts = [['i', 'love', 'my', 'mmf'],
         ['he', 'hate', 'my', 'mmf'],
         ['he', 'and', 'i', 'hate', 'and', 'love', 'my','mmf']]

# Vocabulary as (index, word) pairs.  The distinct words are sorted before
# being numbered: a bare set of strings has no stable iteration order across
# interpreter runs, so the column assigned to each word used to change after
# every kernel restart.
dictionary = list(enumerate(sorted({word for text in texts for word in text})))

def vectorize(text):
    """Return the bag-of-words count vector of a tokenised text.

    Parameters:
        text: iterable of word tokens.

    Returns:
        A 1-D float array with one entry per word of the module-level
        ``dictionary``; entry ``i`` is the number of times the word with
        index ``i`` occurs in ``text``.  Tokens that are not in
        ``dictionary`` are ignored.
    """
    position = {word: i for i, word in dictionary}
    vector = np.zeros(len(dictionary))
    # Single pass over the text instead of rescanning it for every word.
    for w in text:
        if w in position:
            vector[position[w]] += 1
    return vector

for t in texts:
    print(vectorize(t))

In [None]:
# Show the (index, word) pairs that make up the vocabulary.
dictionary

#### Bag of words with N-grams

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(1, 1): only single words (unigrams) become features.
vect = CountVectorizer(ngram_range=(1, 1))
# Fit on the two sample sentences and show their count matrix as a dense array.
vect.fit_transform(["no i have questions", "i have no questions"]).toarray()

In [None]:
# Vocabulary learned by the unigram vectorizer.
vect.vocabulary_

In [None]:
# ngram_range=(1, 2): unigrams and bigrams both become features.
vect = CountVectorizer(ngram_range=(1, 2))
# Fit on the same two sentences and show their count matrix as a dense array.
vect.fit_transform(["no i have questions", "i have no questions"]).toarray()

In [None]:
# Vocabulary learned by the unigram + bigram vectorizer.
vect.vocabulary_

#### TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Three one-sentence documents used as the toy corpus.
Document1 = "It is going to rain today."
Document2 = "Today I am not going outside."
Document3 = "I am going to watch the season premiere."

# The corpus, in document order.
Doc = [Document1, Document2, Document3]
print(Doc)

In [None]:
# TF-IDF vectorizer with default settings, fitted on the toy corpus.
vectorizer = TfidfVectorizer()
# X holds the transformed corpus returned by fit_transform.
X = vectorizer.fit_transform(Doc)

In [None]:
# Print the transformed corpus as a dense array.
print('Document transform',X.toarray())

In [None]:
# First row of the dense array, i.e. the vector of the first document.
X.toarray()[0]

In [None]:
# Vocabulary learned by the TF-IDF vectorizer.
print(vectorizer.vocabulary_)

<font color = 'red' size = 5>Task 2 </font>

1. Study the example of working with a text dataset described above.

2. Prepare a text dataset of movie reviews for further work: [IMDb reviews](https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews).
+ 2.1 Using the built-in capabilities of *TfidfVectorizer*.
+ 2.2 By implementing an analogue yourself.
+ 2.3 Save the prepared dataset to a new csv file.

[https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/](https://www.geeksforgeeks.org/understanding-tf-idf-term-frequency-inverse-document-frequency/)

## <font color = 'green'> 2. Introduction to classification.</font>

In [None]:
# Synthetic two-class problem: integer points with coordinates in [0, 40),
# labelled 1 when the first coordinate exceeds the second and 0 otherwise.
# The generator is seeded so that the sample -- and every result derived
# from it -- is identical on each run; the draw was previously unseeded.
rng = np.random.default_rng(42)
X_data = rng.integers(0, 40, (300, 2))
# Drop the points with equal coordinates.
rows = np.where(X_data[:, 0] != X_data[:, 1])
X_data = X_data[rows]
# Vectorised form of int(sign(a - b) == 1): 1 exactly when a > b.
y_data = (X_data[:, 0] > X_data[:, 1]).astype(int)

plt.scatter(X_data[:,0],X_data[:,1],c = y_data);

# Logistic regression with penalty=None, fitted on the synthetic points.
inner_model = LogisticRegression(penalty=None)
inner_model.fit(X_data, y_data)

# Predicted labels and predicted class probabilities for the training points themselves.
inner_preds = inner_model.predict(X_data)
inner_preds_pr = inner_model.predict_proba(X_data)
# accuracy: fraction of points whose predicted label equals the true label
(inner_preds == y_data).mean()

# Predicted probabilities computed above.
inner_preds_pr

# Fitted coefficients.
print(inner_model.coef_)

# Fitted intercept.
inner_model.intercept_

<font color = 'red' size = 5>Task 3 </font>

Using built-in *Python* capabilities, implement a simple classifier model for the data from Sec. 1 and Sec. 2. Draw conclusions about the accuracy using the *accuracy_score* metric.
