In [7]:
# import libraries
import numpy as np
import scipy.stats as stats

In [8]:
# descriptive statistcs
data= np.array([10,10,20,30,40,50])
mean = np.mean(data)
median = np.median(data)
mode =stats.mode(data)
variance = np.var(data)

print(mean)
print(median)
print(mode)
print(variance)


26.666666666666668
25.0
ModeResult(mode=10, count=2)
222.2222222222222


In [9]:
# inferential statistics 
confidence_level = 0.95
degrees_freedom = len(data)-1
confidence_interval = stats.t.interval(confidence_level,degrees_freedom,mean,stats.sem(data))

print(confidence_interval)


(9.529454422568406, 43.80387891076493)


In [10]:
# hypothesis testing 
np.random.seed(42)
groupA = np.random.normal(50,10,100)
groupB = np.random.normal(53,10,100)
# groupC = np.random.normal(51,10,100)

t_stat,p_value = stats.ttest_ind(groupA,groupB)
# f_value,p_value = stats.f_oneway(groupA,groupB,groupC)

print(t_stat)
print(p_value)


-3.2359903436078983
0.0014208821931449166


In [11]:
# Data wrangling and preprocessing 
import pandas as pd

In [12]:
data1 = {
    'Age':[25,26,np.nan,29,22,34],
    'Salary':[5000,5400,5900,None,5800,6000],
    'Gender':['Male','Female','Male','Female','Male','Female'],
    'Years_at_company':[1,2,3,np.nan,2,8]
}

df = pd.DataFrame(data1)
print(df)
              

    Age  Salary  Gender  Years_at_company
0  25.0  5000.0    Male               1.0
1  26.0  5400.0  Female               2.0
2   NaN  5900.0    Male               3.0
3  29.0     NaN  Female               NaN
4  22.0  5800.0    Male               2.0
5  34.0  6000.0  Female               8.0


In [13]:
# missing values
missing = df.isnull().sum()
missing

Age                 1
Salary              1
Gender              0
Years_at_company    1
dtype: int64

In [14]:
# use simple imputer from sklearn to handle missing values 
from sklearn.impute import SimpleImputer
num_imputer = SimpleImputer(strategy='median')
df['Age'] = num_imputer.fit_transform(df[['Age']])
df['Years_at_company'] = num_imputer.fit_transform(df[['Years_at_company']])

In [15]:
# impute categorical variable 
cat_imputer = SimpleImputer(strategy='most_frequent')
df['Salary'] = cat_imputer.fit_transform(df[['Salary']])

In [16]:
df

Unnamed: 0,Age,Salary,Gender,Years_at_company
0,25.0,5000.0,Male,1.0
1,26.0,5400.0,Female,2.0
2,26.0,5900.0,Male,3.0
3,29.0,5000.0,Female,2.0
4,22.0,5800.0,Male,2.0
5,34.0,6000.0,Female,8.0


In [17]:
# Feature engineering
df['Age_years_at_company_interaction'] = df['Age'] *df['Years_at_company']
df['Age_years_at_company_interaction']


0     25.0
1     52.0
2     78.0
3     58.0
4     44.0
5    272.0
Name: Age_years_at_company_interaction, dtype: float64

In [18]:
df1 = pd.DataFrame({'Employee':['John','Ann'],'Dept':['Finance','data']})
df2 = pd.DataFrame({'Employee':['John','Ann'],'Project':['Budget','Models']})

In [19]:
# merging dataframes
merge_df = pd.merge(df1,df2,on='Employee',how='left')
pivot_table = df1.pivot(index='Employee',columns='Dept',values='Dept')
pivot_table.fillna(0,inplace=True)
merge_df

Unnamed: 0,Employee,Dept,Project
0,John,Finance,Budget
1,Ann,data,Models


In [21]:
# Machine learning 
!pip install scikit-learn



In [34]:
# Logistic Regression 
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [35]:
data = load_iris()
data

{'data': array([[5.1, 3.5, 1.4, 0.2],
        [4.9, 3. , 1.4, 0.2],
        [4.7, 3.2, 1.3, 0.2],
        [4.6, 3.1, 1.5, 0.2],
        [5. , 3.6, 1.4, 0.2],
        [5.4, 3.9, 1.7, 0.4],
        [4.6, 3.4, 1.4, 0.3],
        [5. , 3.4, 1.5, 0.2],
        [4.4, 2.9, 1.4, 0.2],
        [4.9, 3.1, 1.5, 0.1],
        [5.4, 3.7, 1.5, 0.2],
        [4.8, 3.4, 1.6, 0.2],
        [4.8, 3. , 1.4, 0.1],
        [4.3, 3. , 1.1, 0.1],
        [5.8, 4. , 1.2, 0.2],
        [5.7, 4.4, 1.5, 0.4],
        [5.4, 3.9, 1.3, 0.4],
        [5.1, 3.5, 1.4, 0.3],
        [5.7, 3.8, 1.7, 0.3],
        [5.1, 3.8, 1.5, 0.3],
        [5.4, 3.4, 1.7, 0.2],
        [5.1, 3.7, 1.5, 0.4],
        [4.6, 3.6, 1. , 0.2],
        [5.1, 3.3, 1.7, 0.5],
        [4.8, 3.4, 1.9, 0.2],
        [5. , 3. , 1.6, 0.2],
        [5. , 3.4, 1.6, 0.4],
        [5.2, 3.5, 1.5, 0.2],
        [5.2, 3.4, 1.4, 0.2],
        [4.7, 3.2, 1.6, 0.2],
        [4.8, 3.1, 1.6, 0.2],
        [5.4, 3.4, 1.5, 0.4],
        [5.2, 4.1, 1.5, 0.1],
  

In [36]:
x=data.data
y=data.target

In [37]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [38]:
Classifier = LogisticRegression(max_iter=200)
Classifier.fit(x_train,y_train)

In [39]:
predictions = Classifier.predict(x_test)
predictions

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0])

In [42]:
accuracy = accuracy_score(y_test,predictions)
print('Accuracy:',accuracy*100)

Accuracy: 100.0


In [43]:
# Deep Learning
!pip install tensorflow

Collecting tensorflow
  Downloading tensorflow-2.17.0-cp311-cp311-win_amd64.whl.metadata (3.2 kB)
Collecting tensorflow-intel==2.17.0 (from tensorflow)
  Downloading tensorflow_intel-2.17.0-cp311-cp311-win_amd64.whl.metadata (5.0 kB)
Collecting absl-py>=1.0.0 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading absl_py-2.1.0-py3-none-any.whl.metadata (2.3 kB)
Collecting astunparse>=1.6.0 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading astunparse-1.6.3-py2.py3-none-any.whl.metadata (4.4 kB)
Collecting flatbuffers>=24.3.25 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading flatbuffers-24.3.25-py2.py3-none-any.whl.metadata (850 bytes)
Collecting gast!=0.5.0,!=0.5.1,!=0.5.2,>=0.2.1 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading gast-0.6.0-py3-none-any.whl.metadata (1.3 kB)
Collecting google-pasta>=0.1.1 (from tensorflow-intel==2.17.0->tensorflow)
  Downloading google_pasta-0.2.0-py3-none-any.whl.metadata (814 bytes)
Collecting h5py>=3.10.0 (from tensorflow-

In [55]:
import tensorflow as tf
from keras.models import Sequential
from tensorflow.keras.layers import Dense

# data
x= np.random.rand(100,10)
y=np.random.randint(2,size=(100,1))
# Define Sequential Model
model = Sequential([
    Dense(10,activation='relu',input_shape=(10,)),
    Dense(1,activation='sigmoid')
    ])
# Compile your model 
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

model.fit(x,y,epochs=100)

# Evaluate
loss,accuracy = model.evaluate(x,y)
print(f'Loss:{loss},Accuracy:{accuracy}')

Epoch 1/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 8ms/step - accuracy: 0.5516 - loss: 0.7170
Epoch 2/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5966 - loss: 0.6858 
Epoch 3/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5869 - loss: 0.6961 
Epoch 4/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.5582 - loss: 0.6933 
Epoch 5/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5415 - loss: 0.7008 
Epoch 6/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5778 - loss: 0.6932 
Epoch 7/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5455 - loss: 0.7036 
Epoch 8/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5914 - loss: 0.6900 
Epoch 9/100
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37