In [1]:
import pandas as pd
import numpy as np
import datetime

import sys
sys.path.append("/Users/derekdewald/Documents/Python/Github_Repo/d_py_functions")

google_notes_csv = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vSQF2lNc4WPeTRQ_VzWPkqSZp4RODFkbap8AqmolWp5bKoMaslP2oRVVG21x2POu_JcbF1tGRcBgodu/pub?output=csv'
google_definition_csv = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQq1-3cTas8DCWBa2NKYhVFXpl8kLaFDohg0zMfNTAU_Fiw6aIFLWfA5zRem4eSaGPa7UiQvkz05loW/pub?output=csv'

notes = pd.read_csv(google_notes_csv).fillna('')
notes = notes[['Category','Categorization','Word','Definition']].copy()

definitions = pd.read_csv(google_definition_csv).fillna('')

ml_def = definitions.copy()
ml_notes = notes[notes['Category'].isin(['Machine Learning','Visualizations'])]

from df_processing import final_dataset_for_markdown
d_learning_notes = final_dataset_for_markdown(notes,definitions)
d_learning_notes

Unnamed: 0,Category,Categorization,Word,Definition
0,Machine Learning,Definition,Machine Learning,Using a process to enable computers to iterati...
1,Machine Learning,Definition,"Bias, Fairness and Ethics",This step examines whether the model produces ...
2,Machine Learning,Definition,Data Collection,"Data collection involves identifying, sourcing..."
3,Machine Learning,Definition,Data Preparation,Data preparation focuses on cleaning and struc...
4,Machine Learning,Definition,Deployment,Deployment is the process of integrating the t...
...,...,...,...,...
388,Mathematics,Best Linear Unbiased Estimator,Requirements,Homoscedasticity: Variance of the residuals sh...
389,Mathematics,Best Linear Unbiased Estimator,Requirements,Independence: Observations should be independe...
390,Mathematics,Best Linear Unbiased Estimator,Requirements,Linearity: Relationship between Independent an...
391,Mathematics,Best Linear Unbiased Estimator,Requirements,No Perfect Collinearity: No perfect linear rel...


In [5]:

def final_dataset_for_markdown_V3(notes=None,
                                  definitions=None,
                                  export_location='/Users/derekdewald/Documents/Python/Github_Repo/Streamlit/DataDictionary/'):
    
    '''
    
    Function which helps to combined Notes and Definitions into a Single Combined Representation which can ultimately be used as a Learning Reference Tools.

    How this function Works:
    It takes the two sheets and attemps to Consolidate them together to make Final Dataset, generated as d_learning_notes.csv.

    Approach is to Take Notes As the Framework and Distribute All Information into the Notes Sequentially in Specific Order so i can 
    Understand the structure and how to continue and utilize the Sheet.

    Principles: 
        - Define all types of Records in Notes and then Move them Out Definiton By Definition
        - Worry about Order at End.

    Step 1: Insert Records where Categorization = Definition directly into Sheet.
            With Modification:
                Where Word is also A Categorization, Update Categorization from Definition to Word, Change Word to Definition, 
                and Updated Definition to include WORD:

    Step 2: Take Everything else. Should be nothing remaining from Notes Category, so need to update Category, by moving all information 
            to the Right 1 column and consolidating Definition.
            
    Parameters:
        notes(df): DataFrame of D Notes as stored in: https://docs.google.com/spreadsheets/d/e/2PACX-1vSQF2lNc4WPeTRQ_VzWPkqSZp4RODFkbap8AqmolWp5bKoMaslP2oRVVG21x2POu_JcbF1tGRcBgodu/pub?output=csv
        definitions(df): DataFrame of D Definitions as stored in: https://docs.google.com/spreadsheets/d/e/2PACX-1vQq1-3cTas8DCWBa2NKYhVFXpl8kLaFDohg0zMfNTAU_Fiw6aIFLWfA5zRem4eSaGPa7UiQvkz05loW/pub?output=csv
        export_location(str): Location of where to Save CSV File. If blank, no CSV is made.

    date_created:20-Dec-25
    date_last_modified: 21-Dec-25
    classification:TBD
    sub_classification:TBD
    usage:
        final_dataset_for_markdown()

    ##############

    Has been tested for a Single Value - Machine Learning. Need to Validate once extending.

    ##############

    
    '''

    from data_d_dicts import links

    try:
        len(notes)
    except:
        notes = pd.read_csv(links['google_notes_csv'])

    final_df = notes.copy()
    
    try:
        len(definitions)
    except:
        definitions = pd.read_csv(links['google_definition_csv'])

    temp_def = definitions[['Category','Categorization','Word','Definition']].copy()

    # Step 1
    # Create Definition only df and a Residual Definition DF res_def_df
    df_cat_is_definition = temp_def[temp_def['Categorization']=='Definition'].copy()

    
    
    cat_list = notes['Categorization'].unique().tolist()
    # When Word is also a Categorization, update so it is grouped correctly when sorting
    df_cat_is_definition['Categorization'] = np.where(df_cat_is_definition['Word'].isin(cat_list),df_cat_is_definition['Word'],df_cat_is_definition['Categorization'])
    
    mask = df_cat_is_definition["Categorization"] != "Definition"

    df_cat_is_definition.loc[mask, "Definition"] = (
        df_cat_is_definition.loc[mask, "Word"].astype(str)
        + ": "
        + df_cat_is_definition.loc[mask, "Definition"].astype(str)
    )

    df_cat_is_definition['Word'] = np.where(df_cat_is_definition['Categorization']!='Definition','Definition',df_cat_is_definition['Word'])
    res_def_df = temp_def[temp_def['Categorization']!='Definition'].copy()
    

    #Step 2
    
    def prepare_df_for_insert(df,notes=notes):
        df = df.copy()
        df['Definition'] = df['Word'] + ": " + df['Definition']
        df['Word'] = df["Categorization"].copy()
        df['Categorization'] = df["Category"].copy()
        df.drop('Category',axis=1,inplace=True)
        
        # Need to Identify Category. Should Primarily be from Categorization, might be from Word in Rare cases ("Regularization")
        df1 =  df[['Categorization']].drop_duplicates().merge(notes[['Category','Categorization']].drop_duplicates(),on='Categorization',how='left')    
        return df.merge(df1,on=['Categorization'],how='left')

    res_def_df = prepare_df_for_insert(res_def_df)
    
    # Insert 
    final_df = pd.concat([final_df,df_cat_is_definition,res_def_df])

    ################
    
    # Ranking for Sort ORder
    rank_df1 = notes.drop_duplicates('Category')[['Category']].reset_index(drop=True).reset_index().rename(columns={'index':"CY_RANK"})
    rank_df2 = notes.drop_duplicates(['Category','Categorization'])[['Category','Categorization']].reset_index(drop=True).reset_index().rename(columns={'index':"CZ_RANK"})
    rank_df2['CZ_RANK'] = rank_df2['CZ_RANK'] + 1
    rank_df3 = notes.drop_duplicates(['Category','Categorization','Word']).reset_index(drop=True).reset_index().drop('Definition',axis=1).rename(columns={'index':'WORD_RANK'})
    rank_df3['WORD_RANK'] = rank_df3['WORD_RANK'] + 1

    final_df = final_df.merge(rank_df1,on=['Category'],how='left').merge(rank_df2,on=['Category','Categorization'],how='left').merge(rank_df3,on=['Category','Categorization','Word'],how='left')
    final_df['CZ_RANK'] = np.where(final_df['Categorization']=='Definition',0,final_df['CZ_RANK'])
    final_df['WORD_RANK'] = np.where(final_df['Word']=='Definition',0,final_df['WORD_RANK'])
    final_df =  final_df.sort_values(['CY_RANK','CZ_RANK','WORD_RANK'])

    if export_location:
        final_df.to_csv(f"{export_location}d_learning_notes.csv",index=False)
    
    return final_df,df_cat_is_definition

d,e = final_dataset_for_markdown_V3(notes,definitions)
d
                      

Unnamed: 0,Category,Categorization,Word,Definition,CY_RANK,CZ_RANK,WORD_RANK
120,Machine Learning,Definition,Area Under the Curve,AUC (Area Under the Curve) measures a model’s ...,0.0,0.0,
125,Machine Learning,Definition,Autoregressive,Autoregressive models generate outputs one ste...,0.0,0.0,
126,Machine Learning,Definition,Bias,Bias refers to the systematic error in a model...,0.0,0.0,
127,Machine Learning,Definition,Bias - Variance Trade Off,The bias-variance trade-off is a key concept i...,0.0,0.0,
128,Machine Learning,Definition,Cross Validation,Model evaluation technique used to assess how ...,0.0,0.0,
...,...,...,...,...,...,...,...
147,Tool,Definition,Pytorch,PyTorch is an open-source deep learning framew...,,0.0,
148,Tool,Definition,Tensor Flow,TensorFlow is an open-source machine learning ...,,0.0,
149,Training,Definition,Activation Function,A mathematical function applied to a neuron’s ...,,0.0,
150,Training,Definition,Loss Function,A loss function is a mathematical function tha...,,0.0,


In [8]:
e

Unnamed: 0,Category,Categorization,Word,Definition
5,Machine Learning,"Bias, Fairness and Ethics",Definition,"Bias, Fairness and Ethics: This step examines ..."
6,Machine Learning,Data Collection,Definition,Data Collection: Data collection involves iden...
7,Machine Learning,Data Preparation,Definition,Data Preparation: Data preparation focuses on ...
8,Machine Learning,Deployment,Definition,Deployment: Deployment is the process of integ...
9,Machine Learning,Definition,Area Under the Curve,AUC (Area Under the Curve) measures a model’s ...
10,Machine Learning,Evaluation,Definition,Evaluation: Evaluation measures how well the f...
11,Machine Learning,Feature Engineering,Definition,Feature Engineering: Feature engineering is th...
12,Machine Learning,Feature Selection,Definition,Feature Selection: Feature selection involves ...
13,Machine Learning,Hyperparameter Tuning,Definition,Hyperparameter Tuning: Hyperparameter tuning i...
14,Machine Learning,Definition,Autoregressive,Autoregressive models generate outputs one ste...


In [7]:
d[d['Word']=='Regularization']

Unnamed: 0,Category,Categorization,Word,Definition,CY_RANK,CZ_RANK,WORD_RANK
143,Machine Learning,Definition,Regularization,Technique used to prevent overfitting by addin...,0.0,0.0,
44,Machine Learning,Model,Regularization,Automatically Included because it has a Record...,0.0,6.0,43.0
389,Machine Learning,Model,Regularization,ElasticNet: Approach which combines L1 and L2 ...,0.0,6.0,43.0
390,Machine Learning,Model,Regularization,Lasso: Technique used to prevent overfitting b...,0.0,6.0,43.0
391,Machine Learning,Model,Regularization,Ridge: Technique used to prevent overfitting b...,0.0,6.0,43.0


In [81]:
 def prepare_df_for_insert(df,notes=notes):
        df = df.copy()
        df['Definition'] = df['Word'] + ": " + df['Definition']
        df['Word'] = df["Categorization"].copy()
        df['Categorization'] = df["Category"].copy()
        df.drop('Category',axis=1,inplace=True)
        
        # Need to Identify Category. Should Primarily be from Categorization, might be from Word in Rare cases ("Regularization")
        df1 =  df[['Categorization']].drop_duplicates().merge(notes[['Category','Categorization']].drop_duplicates(),on='Categorization',how='left')
    
        for word in df1[df1['Category'].isnull()]['Categorization']:
            cat = notes[notes['Word']==word].iloc[0]['Category']
            df1['Category'] = np.where(df1['Categorization']==word,cat,df1['Category'])
        
        return df.merge(df1,on=['Categorization'],how='left')

f = prepare_df_for_insert(d)

f['Category'].value_counts()

Category
Machine Learning    237
Mathematics           5
Name: count, dtype: int64

In [32]:
# Identify what is NOT Included in Final
columns = ['Category','Categorization','Word','Definition']
test = definitions[columns].merge(d[columns],on=['Category','Categorization','Word'],how='left',indicator=True)

test1 = test[(test['Categorization']!='Definition')&(test['_merge']!='both')]
test1

Unnamed: 0,Category,Categorization,Word,Definition_x,Definition_y,_merge
0,Best Linear Unbiased Estimator,Requirements,Homoscedasticity,Variance of the residuals should be constant.,,left_only
1,Best Linear Unbiased Estimator,Requirements,Independence,Observations should be independent of each oth...,,left_only
2,Best Linear Unbiased Estimator,Requirements,Linearity,Relationship between Independent and Dependent...,,left_only
3,Best Linear Unbiased Estimator,Requirements,No Perfect Collinearity,No perfect linear relationship between residuals,,left_only
4,Best Linear Unbiased Estimator,Requirements,Normality of Residuals,Residuals should be normally distributed.,,left_only
...,...,...,...,...,...,...
268,Regularization,Constraint,Ridge,Technique used to prevent overfitting by addin...,,left_only
276,Validation,Function,Accuracy,Number of correctly classified examples divide...,,left_only
277,Validation,Procedure,Confusion Matrix,Table used to evaluate the performance of a cl...,,left_only
278,Validation,Procedure,K Fold Cross Validation,Application of Cross Validation. Method for in...,,left_only


In [None]:
# in theory there should be NO Category Records Left - Need to Check this somehow, 


In [34]:
test1['Category'].value_counts()

Category
Model                             221
Training                            9
Best Linear Unbiased Estimator      5
Validation                          4
Regularization                      3
Name: count, dtype: int64

In [None]:
# I want to Distribute the Information From Definitions into Notes, While Maintaining the SIMPLEST Structure Possible.
# So the approach is take the notes Framework. Make it Minimal, and merge Information in.
# Over Engineering the Process.


In [24]:
def search_df_word(df,word,columns=None,case_sensitive=False,print_=True):
    '''
    
    
    '''

    # If User does not define which columns they wish to search, search all.
    if not columns:
        columns = df.columns.tolist()

    final_dict = {}
    final_df = pd.DataFrame()

    for column in columns:
        match_df = df[df[column].fillna('').str.contains(word,case=case_sensitive)]
        if len(match_df)==0:
            final_dict[column] = 'No Matches'
        else:
            final_word = ""
            for word in match_df[column]:
                final_word += word + ","
        
            final_dict[column] = final_word
            final_df = pd.concat([final_df,match_df])

    if print_:
        try:
            print(final_df)
        except:
            pass
            
    return final_df, final_dict

#final_df,final_dict = search_df_word(d_learning_notes,'regularization')
final_df,final_dict = search_df_word(d_learning_notes,'validation')

    

             Category Categorization                   Word  \
307  Machine Learning     Validation             Definition   
308  Machine Learning     Validation                   Goal   
309  Machine Learning     Validation               Approach   
310  Machine Learning     Validation  Important to Remember   
311  Machine Learning     Validation          Lesson Learnt   
312  Machine Learning     Validation              Algorithm   
313  Machine Learning     Validation               Function   
314  Machine Learning     Validation              Procedure   
315  Machine Learning     Validation              Procedure   
316  Machine Learning     Validation              Procedure   
317  Machine Learning     Validation                    TBD   
5    Machine Learning     Definition       Cross Validation   
315  Machine Learning     Validation              Procedure   
316  Machine Learning     Validation              Procedure   

                                            Definition

### Examples of How Information is Included

In [87]:
def d_learning_notes_guide(example_word_dict={},
                           df=d_learning_notes,
                           notes=notes,
                           definitions=definitions):

    if len(example_word_dict)==0:
        example_word_dict= {'Category':'Machine Learning',
                            'Categorization':'Model',
                            'Word':'Bias - Variance Trade Off',
                            'Definition':'Linear Regression'}
    
    for key,word in example_word_dict.items():
        print('\n############################### New Word ###############################\n')
        print(f'Word: {word}')
        print(f'Word Classification: {key}')
    
        for key,data_set in {'notes':notes,'definitions':definitions,'d_learning_notes':d_learning_notes}.items():
    
            print(f"\nInclusion in {key}:")
            for column in ['Category','Categorization','Word','Definition']:
                try:
                    temp = data_set[data_set[column]==word][['Category','Categorization','Word','Definition']].copy()
                except:
                    temp = data_set[data_set[column]==word].copy()
                
                print(f'\nAs Word in {column}')
                
                if len(temp)>0:    
                    display(temp.head(10))
                else:
                    print(f'No Records Found')
                
d_learning_notes_guide()


############################### New Word ###############################

Word: Machine Learning
Word Classification: Category

Inclusion in notes:

As Word in Category


Unnamed: 0,Category,Categorization,Word,Definition
0,Machine Learning,Problem Definition,Goal,Need this to Pull from Somewhere Else.
1,Machine Learning,Problem Definition,Approach,
2,Machine Learning,Problem Definition,Important to Remeber,
3,Machine Learning,Problem Definition,Lesson Learnt,
4,Machine Learning,Problem Definition,Algorithm,
5,Machine Learning,Problem Definition,Function,
6,Machine Learning,Problem Definition,Procedure,
7,Machine Learning,Problem Definition,TBD,
8,Machine Learning,Data Collection,Goal,Need this to Pull from Somewhere Else.
9,Machine Learning,Data Collection,Approach,



As Word in Categorization
No Records Found

As Word in Word
No Records Found

As Word in Definition
No Records Found

Inclusion in definitions:

As Word in Category


Unnamed: 0,Category,Categorization,Word,Definition
0,Machine Learning,Definition,Area Under the Curve,AUC (Area Under the Curve) measures a model’s ...
1,Machine Learning,Definition,Autoregressive,Autoregressive models generate outputs one ste...
2,Machine Learning,Definition,Bias,Bias refers to the systematic error in a model...
3,Machine Learning,Definition,Bias - Variance Trade Off,The bias-variance trade-off is a key concept i...
4,Machine Learning,Definition,Cross Validation,Model evaluation technique used to assess how ...
5,Machine Learning,Definition,Curse of Dimensionality,"As dimensionality grows, data points become in..."
6,Machine Learning,Definition,Deep Learning,Deep learning is a subset of machine learning ...
7,Machine Learning,Definition,Machine Learning,Using a process to enable computers to iterati...
8,Machine Learning,Definition,Optimization,Optimization is the process of finding the bes...
9,Machine Learning,Definition,Reinforcement Learning,Reinforcement learning trains an agent to make...



As Word in Categorization
No Records Found

As Word in Word


Unnamed: 0,Category,Categorization,Word,Definition
7,Machine Learning,Definition,Machine Learning,Using a process to enable computers to iterati...



As Word in Definition
No Records Found

Inclusion in d_learning_notes:

As Word in Category


Unnamed: 0,Category,Categorization,Word,Definition
0,Machine Learning,Definition,Machine Learning,Using a process to enable computers to iterati...
1,Machine Learning,Definition,Area Under the Curve,AUC (Area Under the Curve) measures a model’s ...
2,Machine Learning,Definition,Autoregressive,Autoregressive models generate outputs one ste...
3,Machine Learning,Definition,Bias,Bias refers to the systematic error in a model...
4,Machine Learning,Definition,Bias - Variance Trade Off,The bias-variance trade-off is a key concept i...
5,Machine Learning,Definition,Cross Validation,Model evaluation technique used to assess how ...
6,Machine Learning,Definition,Curse of Dimensionality,"As dimensionality grows, data points become in..."
7,Machine Learning,Definition,Deep Learning,Deep learning is a subset of machine learning ...
8,Machine Learning,Definition,Optimization,Optimization is the process of finding the bes...
9,Machine Learning,Definition,Reinforcement Learning,Reinforcement learning trains an agent to make...



As Word in Categorization
No Records Found

As Word in Word


Unnamed: 0,Category,Categorization,Word,Definition
0,Machine Learning,Definition,Machine Learning,Using a process to enable computers to iterati...



As Word in Definition
No Records Found

############################### New Word ###############################

Word: Model
Word Classification: Categorization

Inclusion in notes:

As Word in Category
No Records Found

As Word in Categorization


Unnamed: 0,Category,Categorization,Word,Definition
42,Machine Learning,Model,Goal,
43,Machine Learning,Model,Approach,
44,Machine Learning,Model,Regularization,Automatically Included because it has a Record...
45,Machine Learning,Model,Important to Remeber,
46,Machine Learning,Model,Lesson Learnt,
47,Machine Learning,Model,Algorithm,
48,Machine Learning,Model,Function,
49,Machine Learning,Model,Procedure,
50,Machine Learning,Model,Constraint,



As Word in Word
No Records Found

As Word in Definition
No Records Found

Inclusion in definitions:

As Word in Category


Unnamed: 0,Category,Categorization,Word,Definition
18,Model,Algorithm,Ada,Boosting technique that combines multiple weak...
19,Model,Algorithm,AdaBoostClassifier,
20,Model,Algorithm,AdaBoostRegressor,
21,Model,Algorithm,AffinityPropagation,
22,Model,Algorithm,Agentic AI,"Perform a Specific Task, Autonomously while le..."
23,Model,Algorithm,AgglomerativeClustering,Agglomerative Clustering is a type of hierarch...
24,Model,Algorithm,ARDRegression,
25,Model,Algorithm,BaggingClassifier,
26,Model,Algorithm,BaggingRegressor,
27,Model,Algorithm,BayesianGaussianMixture,



As Word in Categorization
No Records Found

As Word in Word
No Records Found

As Word in Definition
No Records Found

Inclusion in d_learning_notes:

As Word in Category
No Records Found

As Word in Categorization


Unnamed: 0,Category,Categorization,Word,Definition
61,Machine Learning,Model,Definition,Not Defined
62,Machine Learning,Model,Goal,
63,Machine Learning,Model,Approach,
64,Machine Learning,Model,Regularization,Automatically Included because it has a Record...
65,Machine Learning,Model,Important to Remeber,
66,Machine Learning,Model,Lesson Learnt,
67,Machine Learning,Model,Algorithm,Ada: Boosting technique that combines multiple...
68,Machine Learning,Model,Algorithm,AdaBoostClassifier:
69,Machine Learning,Model,Algorithm,AdaBoostRegressor:
70,Machine Learning,Model,Algorithm,AffinityPropagation:



As Word in Word
No Records Found

As Word in Definition
No Records Found

############################### New Word ###############################

Word: Bias - Variance Trade Off
Word Classification: Word

Inclusion in notes:

As Word in Category
No Records Found

As Word in Categorization
No Records Found

As Word in Word
No Records Found

As Word in Definition
No Records Found

Inclusion in definitions:

As Word in Category
No Records Found

As Word in Categorization
No Records Found

As Word in Word


Unnamed: 0,Category,Categorization,Word,Definition
3,Machine Learning,Definition,Bias - Variance Trade Off,The bias-variance trade-off is a key concept i...



As Word in Definition
No Records Found

Inclusion in d_learning_notes:

As Word in Category
No Records Found

As Word in Categorization
No Records Found

As Word in Word


Unnamed: 0,Category,Categorization,Word,Definition
4,Machine Learning,Definition,Bias - Variance Trade Off,The bias-variance trade-off is a key concept i...



As Word in Definition
No Records Found

############################### New Word ###############################

Word: Linear Regression
Word Classification: Definition

Inclusion in notes:

As Word in Category
No Records Found

As Word in Categorization
No Records Found

As Word in Word
No Records Found

As Word in Definition
No Records Found

Inclusion in definitions:

As Word in Category
No Records Found

As Word in Categorization
No Records Found

As Word in Word


Unnamed: 0,Category,Categorization,Word,Definition
109,Model,Algorithm,Linear Regression,Models the relationship between an independent...



As Word in Definition
No Records Found

Inclusion in d_learning_notes:

As Word in Category
No Records Found

As Word in Categorization
No Records Found

As Word in Word
No Records Found

As Word in Definition
No Records Found


In [None]:
def definition_dq(df,exclude_rule1=['Machine Learning']):
    
    '''
    
    
    '''
    
    df = df.copy()
    
    # Rule 1: Word and Category Can not Equal
    df['Rule1'] = np.where((df['Category']==df['Word'])&(~df['Category'].isin(exclude_rule1)),1,0)
    
    # Rule 2: There should be No Duplication in Word Column
    df['Rule2'] = df.groupby('Word').transform('size')
    df['Rule2'] = np.where(df['Rule2']>1,1,0)

    # Rule 3: All Categories in MUST have a record in Categorization.
    
    
    
    # Identify Rule Violations
    col_list =  [x for x in df.columns if x.find('Rule')!=-1]
    df['Violation'] = df[col_list].sum(axis=1)
    df['Violation'] = np.where(df['Violation']>0,1,0)

    
        
    return df.drop(['Notes','Link','Image','Markdown Equation','Dataset Size','Learning Type','Algorithm Class'],axis=1)

test = definition_dq(definitions)

test[test['Violation']==1]