最长连续相同字符子串

In [1]:
def longest_consecutive_chars(s):
    """Find the longest run of one repeated character in ``s``.

    Returns a ``(char, length)`` tuple. When several runs share the
    maximum length the earliest one wins, and an empty input yields
    ``('', 0)``.
    """
    if not s:
        return '', 0

    best_start, best_len = 0, 1
    run_start = 0

    for pos in range(1, len(s)):
        if s[pos] != s[run_start]:
            # A different character starts a new run at this position.
            run_start = pos
            continue
        run_len = pos - run_start + 1
        # Strict comparison keeps the first run on ties.
        if run_len > best_len:
            best_start, best_len = run_start, run_len

    return s[best_start], best_len

# Test
print(longest_consecutive_chars("aaabbc"))  # Output: ('a', 3)

('a', 3)


寻找Local Max Value

In [3]:
def find_local_max(arr, n):
    """
    Collect every local maximum of ``arr`` for a half-window of size ``n``.

    ``arr[i]`` is a local maximum when it is the largest value in the
    window ``arr[i-n : i+n+1]``, the values leading up to it are strictly
    increasing, and the values following it are strictly decreasing.
    Positions closer than ``n`` to either end are never considered.
    Returns the qualifying values in their original order.
    """
    peaks = []

    for center in range(n, len(arr) - n):
        lo, hi = center - n, center + n

        # The center has to be the maximum of its full window.
        if arr[center] != max(arr[lo:hi + 1]):
            continue

        # Any non-increasing step on the way up disqualifies the position.
        rising = not any(arr[k] <= arr[k - 1]
                         for k in range(lo + 1, center + 1))
        # Any non-decreasing step on the way down disqualifies it as well.
        falling = not any(arr[k] >= arr[k - 1]
                          for k in range(center + 1, hi + 1))

        if rising and falling:
            peaks.append(arr[center])

    return peaks

# Test
arr = [1, 3, 10, 4, 2, 19, 5, 5]
n = 2
print(find_local_max(arr, n))  # Output: [10]

[10]


补全Bootstrap算法代码

bootstrap函数：有放回地随机抽取n个样本索引

In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from random import randint, seed

def bootstrap(n: int) -> list[int]:
    """
    Step 1: Bootstrap the train samples for each base classifier.

    Draws ``n`` indices from ``0 .. n-1`` with replacement, one
    ``randint`` call per index, and returns them as a list.
    """
    upper = n - 1  # randint is inclusive on both ends
    return [randint(0, upper) for _ in range(n)]

def fit(classifiers, x, y):
    """
    Step 2: Train each classifier based on its own bootstrapped samples.

    Every classifier gets a fresh call to ``bootstrap`` and is fitted on
    the rows of ``x`` / ``y`` selected by those indices.
    """
    sample_count = len(x)
    for model in classifiers:
        picked = bootstrap(sample_count)
        model.fit([x[idx] for idx in picked], [y[idx] for idx in picked])

def predict(classifiers, x):
    """
    Step 3: Assign class labels by a majority vote of the base classifiers.

    Each classifier's ``predict(x)`` contributes one vote per sample. The
    label with the most votes wins; ties go to the smallest label in sorted
    order. Votes are counted with ``np.unique`` rather than ``np.bincount``
    so negative or non-integer labels (e.g. ``-1`` or strings) work too.

    Returns a plain Python list with one label per sample.
    Raises ValueError when ``classifiers`` is empty.
    """
    if len(classifiers) == 0:
        raise ValueError("predict requires at least one classifier")

    # Row i holds classifier i's predictions; column j holds all votes for sample j.
    votes = np.array([clf.predict(x) for clf in classifiers])

    winners = []
    for sample_votes in votes.T:
        labels, counts = np.unique(sample_votes, return_counts=True)
        # np.unique sorts labels, so argmax picks the smallest label on ties.
        winners.append(labels[counts.argmax()])
    return np.asarray(winners).tolist()

def solution(x_train, y_train, x_test, n_estimators):
    """
    Step 4: Pull everything together

    Seeds the ``random`` module, builds ``n_estimators`` decision trees,
    trains them with ``fit`` and returns the labels ``predict`` produces
    for ``x_test``.
    """
    seed(42)  # fixed seed so the bootstrap index draws are repeatable
    ensemble = []
    for _ in range(n_estimators):
        ensemble.append(DecisionTreeClassifier(random_state=0))
    fit(ensemble, x_train, y_train)
    return predict(ensemble, x_test)

Naive Bayes实现

In [None]:
import numpy as np
from collections import defaultdict

class NaiveBayesClassifier:
    """Naive Bayes classifier for discrete (categorical) features.

    ``fit`` estimates the class priors P(y) and, for every class and
    feature, the relative frequency of each observed feature value.
    ``predict`` picks the class with the highest posterior score.

    Scores are accumulated in log space: multiplying many probabilities
    underflows to 0.0 once there are enough features, which would make
    every class tie and the prediction arbitrary.
    """

    # Probability used for a feature value never seen with a class during
    # training; a crude floor rather than true Laplace smoothing.
    _UNSEEN_PROB = 1e-6

    def __init__(self):
        # Maps class label -> prior probability P(y).
        self.class_priors = {}
        # Maps (class label, feature index) -> {feature value: P(value | class)}.
        self.feature_likelihoods = defaultdict(dict)

    def fit(self, X, y):
        """Estimate priors and per-feature likelihoods from training data.

        X: 2-D array-like of shape (n_samples, n_features).
        y: 1-D array-like with one class label per sample.
        Any state from a previous call to ``fit`` is discarded.
        """
        X = np.asarray(X)
        y = np.asarray(y)

        # Start from a clean slate so a refit cannot keep stale classes.
        self.class_priors = {}
        self.feature_likelihoods = defaultdict(dict)

        # Prior P(y): relative frequency of each class.
        total_samples = len(y)
        classes, counts = np.unique(y, return_counts=True)
        for cls, cnt in zip(classes, counts):
            self.class_priors[cls] = cnt / total_samples

        # Likelihood P(x | y) for discrete features: relative frequency of
        # each value among the samples of the class.
        for cls in classes:
            X_cls = X[y == cls]
            for feature_idx in range(X.shape[1]):
                feature_vals, val_counts = np.unique(
                    X_cls[:, feature_idx], return_counts=True
                )
                probabilities = val_counts / len(X_cls)
                self.feature_likelihoods[(cls, feature_idx)] = dict(
                    zip(feature_vals, probabilities)
                )

    def predict(self, X):
        """Return a list with the most probable class label for each sample in X."""
        log_floor = np.log(self._UNSEEN_PROB)
        predictions = []
        for sample in X:
            log_posteriors = {}
            for cls, prior in self.class_priors.items():
                score = np.log(prior)
                for feature_idx, feature_val in enumerate(sample):
                    prob_dict = self.feature_likelihoods.get((cls, feature_idx), {})
                    prob = prob_dict.get(feature_val)
                    # Unseen (class, value) pairs fall back to the small floor.
                    score += log_floor if prob is None else np.log(prob)
                log_posteriors[cls] = score
            predictions.append(max(log_posteriors, key=log_posteriors.get))
        return predictions

In [None]:
def fit(self, X, y, alpha=1):
    """Fit class priors and Laplace-smoothed likelihoods for discrete features.

    X: 2-D array-like of shape (n_samples, n_features).
    y: 1-D array-like with one class label per sample.
    alpha: additive smoothing strength (1 gives classic Laplace smoothing).

    Writes ``self.class_priors[cls]`` and
    ``self.feature_likelihoods[(cls, feature_idx)]``, where the latter maps
    every value the feature takes anywhere in the training set to
    ``(count + alpha) / (n_cls + alpha * V)``, with ``V`` the number of
    distinct values of that feature.
    """
    X = np.asarray(X)
    y = np.asarray(y)

    # Prior P(y): relative frequency of each class label.
    classes, class_counts = np.unique(y, return_counts=True)
    for cls, cnt in zip(classes, class_counts):
        self.class_priors[cls] = cnt / len(y)

    for feature_idx in range(X.shape[1]):
        # The value vocabulary comes from the whole training set, so it is
        # the same for every class and is computed once per feature.
        all_vals = np.unique(X[:, feature_idx])
        V = len(all_vals)

        for cls, n_cls in zip(classes, class_counts):
            X_cls = X[y == cls]
            feature_vals, val_counts = np.unique(
                X_cls[:, feature_idx], return_counts=True
            )
            seen = dict(zip(feature_vals, val_counts))

            # Values unseen for this class get a count of 0 and therefore
            # the probability alpha / (n_cls + alpha * V).
            denominator = n_cls + alpha * V
            self.feature_likelihoods[(cls, feature_idx)] = {
                val: (seen.get(val, 0) + alpha) / denominator
                for val in all_vals
            }

K-Means

In [None]:
import numpy as np
import matplotlib.pyplot as plt

def initialize_centroids(X, k):
    """Pick the starting centroids: the first k rows of a seeded shuffle of X.

    Reseeds NumPy's global random generator with 42 on every call, so the
    same data always yields the same starting centroids.
    """
    np.random.seed(42)
    shuffled = np.random.permutation(X.shape[0])
    chosen = shuffled[:k]
    return X[chosen]

def compute_distances(X, centroids):
    """Return the (n_points, n_centroids) matrix of Euclidean distances.

    Entry ``[p, c]`` is the distance between row ``p`` of X and row ``c``
    of ``centroids``.
    """
    n_points, n_centroids = X.shape[0], centroids.shape[0]
    dist_matrix = np.empty((n_points, n_centroids))
    for col in range(n_centroids):
        # One column per centroid: distance from every point to that centroid.
        dist_matrix[:, col] = np.linalg.norm(X - centroids[col], axis=1)
    return dist_matrix

def assign_clusters(distances):
    """Label each point (row) with the column index of its smallest distance."""
    nearest = np.argmin(distances, axis=1)
    return nearest

def compute_centroids(X, labels, k):
    """Compute the new centroids as the mean of all data points assigned to each cluster.

    X: array-like of shape (n_points, n_features).
    labels: array-like of n_points cluster indices.
    k: number of clusters.

    A cluster with no assigned points has no mean (averaging an empty
    slice gives NaN, which would then corrupt every later distance
    computation). Such clusters are instead re-seeded with the data points
    that lie farthest from the centroid of their own cluster, one distinct
    point per empty cluster. If there are fewer points than empty
    clusters, the leftover centroids stay at the origin.

    Returns an array of shape (k, n_features).
    """
    X = np.asarray(X)
    labels = np.asarray(labels)
    centroids = np.zeros((k, X.shape[1]))

    masks = [labels == i for i in range(k)]
    empty_clusters = []
    for i, mask in enumerate(masks):
        if mask.any():
            centroids[i, :] = X[mask].mean(axis=0)
        else:
            empty_clusters.append(i)

    if empty_clusters:
        # Distance of every point to the centroid of the cluster it belongs to.
        spread = np.zeros(X.shape[0])
        for i, mask in enumerate(masks):
            if mask.any():
                spread[mask] = np.linalg.norm(X[mask] - centroids[i], axis=1)
        # Stable sort on the negated distances: farthest first, ties by index.
        farthest_first = np.argsort(-spread, kind="stable")
        for cluster, point_idx in zip(empty_clusters, farthest_first):
            centroids[cluster, :] = X[point_idx]

    return centroids

def kmeans(X, k, max_iters=100):
    """K-means clustering algorithm.

    Alternates between assigning every point to its nearest centroid and
    moving each centroid to the mean of its points, stopping early once an
    update leaves the centroids unchanged or after ``max_iters`` rounds.

    Returns ``(centroids, labels)``. The labels are always derived from the
    centroids that are returned, including when ``max_iters`` is 0 or the
    loop ends without converging.
    """
    centroids = initialize_centroids(X, k)

    for _ in range(max_iters):
        previous = centroids
        labels = assign_clusters(compute_distances(X, previous))
        centroids = compute_centroids(X, labels, k)

        # If centroids do not change, we have converged
        if np.array_equal(centroids, previous):
            break

    # The in-loop labels belong to the centroids *before* the last update;
    # recompute them so the pair handed back is consistent. This also
    # defines `labels` when the loop body never runs.
    labels = assign_clusters(compute_distances(X, centroids))
    return centroids, labels

# Example usage: cluster three synthetic blobs and plot the result.
# Runs only when the file is executed as a script, not on import.
if __name__ == "__main__":
    # Generate some synthetic data
    from sklearn.datasets import make_blobs
    X, _ = make_blobs(n_samples=300, centers=3, cluster_std=0.60, random_state=0)
    print(X.shape, X[:3])

    # Run the K-means algorithm
    k = 3
    centroids, labels = kmeans(X, k)

    # Visualize the results: points coloured by assigned cluster,
    # with the fitted centroids drawn on top as large red markers.
    plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
    plt.scatter(centroids[:, 0], centroids[:, 1], s=300, c='red')  # Centroids
    plt.xlabel('Feature 1')
    plt.ylabel('Feature 2')
    plt.title('K-means Clustering')
    plt.show()

Linear Regression

In [None]:
class LinearRegressionGD:
    """Linear regression fitted by gradient descent on the half mean-squared error.

    After ``fit``, ``theta`` holds the learned coefficients (bias first when
    ``fit_intercept`` is true) and ``loss_history`` holds ``(iteration, loss)``
    pairs sampled every 100 iterations plus the final one.
    """

    def __init__(self, learning_rate=0.01, n_iter=1000,
                 fit_intercept=True, method='batch'):
        """
        learning_rate: step size of each gradient update
        n_iter: number of gradient updates
        fit_intercept: whether to prepend a bias column of ones
        method: 'batch' (full-batch gradient descent) or 'sgd'
                (stochastic gradient descent, one random sample per step)
        """
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.fit_intercept = fit_intercept
        self.method = method
        self.theta = None
        self.loss_history = []  # (iteration, loss) pairs recorded by fit

    def _add_intercept(self, X):
        """Prepend a column of ones so theta[0] acts as the bias."""
        return np.c_[np.ones(X.shape[0]), X]

    def _compute_loss(self, X, y):
        """Return the half mean-squared error of the current theta."""
        m = len(y)
        y_pred = X @ self.theta
        loss = (1/(2*m)) * np.sum((y_pred - y) ** 2)
        return loss

    def fit(self, X, y, verbose=False):
        """Train the model and return ``self``.

        X: array-like of shape (n_samples, n_features).
        y: array-like of n_samples targets; a column vector is flattened.
        verbose: print the loss each time it is recorded.

        Raises ValueError if ``self.method`` is not 'batch' or 'sgd'.
        """
        if self.method not in ('batch', 'sgd'):
            # Previously an unknown method silently left theta at zero.
            raise ValueError(
                f"method must be 'batch' or 'sgd', got {self.method!r}"
            )

        X = np.asarray(X, dtype=float)
        # Flatten y: an (m, 1) target would otherwise broadcast against the
        # (m,) predictions into an (m, m) residual matrix.
        y = np.asarray(y, dtype=float).ravel()

        if self.fit_intercept:
            X = self._add_intercept(X)

        m, n = X.shape
        self.theta = np.zeros(n)  # start from all-zero parameters
        self.loss_history = []    # drop the history of any earlier fit

        for i in range(self.n_iter):
            if self.method == 'batch':
                # Full-batch gradient of the half-MSE loss.
                gradient = (1/m) * X.T @ (X @ self.theta - y)
                self.theta -= self.learning_rate * gradient

            elif self.method == 'sgd':
                # Gradient estimated from one uniformly drawn sample.
                idx = np.random.randint(m)
                x_i = X[idx:idx+1]  # slice keeps the 2-D shape (1, n)
                y_i = y[idx:idx+1]
                gradient = x_i.T @ (x_i @ self.theta - y_i)
                self.theta -= self.learning_rate * gradient

            # Record the loss every 100 iterations and on the last one.
            if i % 100 == 0 or i == self.n_iter - 1:
                loss = self._compute_loss(X, y)
                self.loss_history.append((i, loss))
                if verbose:
                    print(f"Iteration {i}: loss = {loss:.6f}")

        return self

    def predict(self, X):
        """Return the predicted targets for X."""
        X = np.asarray(X, dtype=float)
        if self.fit_intercept:
            X = self._add_intercept(X)
        return X @ self.theta

Logistic Regression

In [None]:
import numpy as np

class LogisticRegression:
    """Binary logistic regression trained with full-batch gradient descent.

    After ``fit``, ``theta`` holds the learned coefficients, with the bias
    first when ``fit_intercept`` is true.
    """

    def __init__(self, learning_rate=0.01, n_iter=1000, fit_intercept=True):
        self.learning_rate = learning_rate
        self.n_iter = n_iter
        self.fit_intercept = fit_intercept
        self.theta = None

    def _add_intercept(self, X):
        """Prepend a column of ones so theta[0] acts as the bias."""
        intercept = np.ones((X.shape[0], 1))
        return np.hstack((intercept, X))

    def _sigmoid(self, z):
        """Logistic function 1 / (1 + exp(-z))."""
        # Clip the input so np.exp cannot overflow for extreme scores.
        z = np.clip(z, -500, 500)
        return 1 / (1 + np.exp(-z))

    def fit(self, X, y):
        """Train on X (n_samples, n_features) and 0/1 labels y; return ``self``.

        A column-vector ``y`` is flattened first.
        """
        X = np.asarray(X, dtype=float)
        # Flatten y: an (m, 1) label array would otherwise broadcast against
        # the (m,) predictions into an (m, m) matrix.
        y = np.asarray(y, dtype=float).ravel()

        if self.fit_intercept:
            X = self._add_intercept(X)

        n_samples, n_features = X.shape
        self.theta = np.zeros(n_features)

        # Gradient descent on the mean log-loss.
        for _ in range(self.n_iter):
            # Predicted probabilities under the current parameters.
            h = self._sigmoid(np.dot(X, self.theta))

            # Gradient of the mean log-loss with respect to theta.
            gradient = np.dot(X.T, (h - y)) / n_samples

            self.theta -= self.learning_rate * gradient

            # Optional, for monitoring:
            # loss = -np.mean(y * np.log(h + 1e-8) + (1-y) * np.log(1-h + 1e-8))

        return self

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of X."""
        X = np.asarray(X, dtype=float)
        if self.fit_intercept:
            X = self._add_intercept(X)
        return self._sigmoid(np.dot(X, self.theta))

    def predict(self, X, threshold=0.5):
        """Return 0/1 labels: 1 where the probability is at least ``threshold``."""
        proba = self.predict_proba(X)
        return (proba >= threshold).astype(int)

Decision Tree 原理与实现

In [None]:
import numpy as np

class DecisionTree:
    """CART-style classification tree using Gini impurity.

    The fitted tree is stored in ``self.tree`` as nested dicts:
    ``{'type': 'leaf', 'value': label}`` for leaves and
    ``{'type': 'node', 'feature', 'threshold', 'left', 'right'}`` for
    internal nodes, where samples with ``x[feature] <= threshold`` go left.
    Labels must be non-negative integers because ``np.bincount`` is used
    for counting.
    """

    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Build the tree from X (n_samples, n_features) and integer labels y."""
        self.tree = self._grow_tree(X, y)

    def predict(self, X):
        """Return an array with the predicted label for each row of X."""
        return np.array([self._predict_one(x, self.tree) for x in X])

    # ========== Core recursive function ==========
    def _grow_tree(self, X, y, depth=0):
        """Recursively build and return the subtree for the samples (X, y)."""
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stop when the depth limit is hit, the node is too small, or it is pure.
        if (depth >= self.max_depth or
            n_samples < self.min_samples_split or
            n_classes == 1):
            leaf_value = self._most_common_label(y)
            return {'type': 'leaf', 'value': leaf_value}

        # Find the best split; None means no threshold separates the samples.
        best_feat, best_thresh = self._best_split(X, y)
        if best_feat is None:
            leaf_value = self._most_common_label(y)
            return {'type': 'leaf', 'value': leaf_value}

        # Recurse into both sides of the split.
        left_idx = X[:, best_feat] <= best_thresh
        right_idx = X[:, best_feat] > best_thresh

        left_subtree = self._grow_tree(X[left_idx], y[left_idx], depth+1)
        right_subtree = self._grow_tree(X[right_idx], y[right_idx], depth+1)

        return {
            'type': 'node',
            'feature': best_feat,
            'threshold': best_thresh,
            'left': left_subtree,
            'right': right_subtree
        }

    # ========== Key helper functions ==========
    def _best_split(self, X, y):
        """Return the (feature, threshold) with the highest Gini gain.

        Returns ``(None, None)`` when every feature is constant, i.e. no
        threshold puts at least one sample on each side.
        """
        best_gain = -1
        split_feat, split_thresh = None, None

        for feat in range(X.shape[1]):
            # Candidate thresholds are the observed values except the largest:
            # `<= max` sends every sample left, leaving an empty right child
            # that would crash the recursion in _most_common_label.
            thresholds = np.unique(X[:, feat])[:-1]
            for thresh in thresholds:
                gain = self._information_gain(X, y, feat, thresh)
                if gain > best_gain:
                    best_gain = gain
                    split_feat, split_thresh = feat, thresh

        return split_feat, split_thresh

    def _information_gain(self, X, y, feat, thresh):
        """Return the Gini impurity decrease from splitting on ``feat <= thresh``.

        Returns 0 when the split leaves one side empty.
        """
        parent_gini = self._gini(y)

        left_idx = X[:, feat] <= thresh
        right_idx = X[:, feat] > thresh

        if len(y[left_idx]) == 0 or len(y[right_idx]) == 0:
            return 0

        n = len(y)
        n_left, n_right = len(y[left_idx]), len(y[right_idx])

        left_gini = self._gini(y[left_idx])
        right_gini = self._gini(y[right_idx])

        # Children are weighted by their share of the parent's samples.
        child_gini = (n_left/n) * left_gini + (n_right/n) * right_gini

        return parent_gini - child_gini

    def _gini(self, y):
        """Return the Gini impurity 1 - sum(p_i^2) of the labels (0 when empty)."""
        n = len(y)
        if n == 0:
            return 0
        proportions = np.bincount(y) / n
        return 1 - np.sum(proportions ** 2)

    def _most_common_label(self, y):
        """Return the most frequent label; ties go to the smallest label."""
        return np.bincount(y).argmax()

    def _predict_one(self, x, node):
        """Walk from ``node`` down to a leaf for sample ``x`` and return its label."""
        if node['type'] == 'leaf':
            return node['value']

        if x[node['feature']] <= node['threshold']:
            return self._predict_one(x, node['left'])
        else:
            return self._predict_one(x, node['right'])

# ========== Usage example ==========
if __name__ == "__main__":
    # Build a small toy dataset
    X = np.array([[1, 2], [2, 3], [3, 3], [6, 5], [7, 8], [8, 8]])
    y = np.array([0, 0, 0, 1, 1, 1])
    
    # Train
    tree = DecisionTree(max_depth=3)
    tree.fit(X, y)
    
    # Predict
    X_test = np.array([[2.5, 3], [7.5, 7]])
    predictions = tree.predict(X_test)
    print(f"Predictions: {predictions}")  # expected labels: [0, 1]

In [None]:
import numpy as np
from collections import Counter

class DecisionTree:
    """A single decision tree (simplified) for use inside a random forest.

    Every split considers only a random subset of ``int(sqrt(n_features))``
    features. The fitted tree is stored in ``self.tree``: a leaf is a bare
    class label, an internal node is a tuple
    ``(feature, threshold, left_subtree, right_subtree)`` where samples
    with ``x[feature] <= threshold`` go left.
    """

    def __init__(self, max_depth=5, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from X (n_samples, n_features) and integer labels y."""
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Return the subtree (leaf label or node tuple) for the samples (X, y)."""
        n_samples, n_features = X.shape
        distinct_labels = np.unique(y)

        # Stop on the depth limit, on too few samples, or on a pure node.
        too_deep = depth >= self.max_depth
        too_small = n_samples < self.min_samples_split
        if too_deep or too_small or len(distinct_labels) == 1:
            return np.bincount(y).argmax()

        # Draw the random feature subset (sqrt(n_features) of them).
        subset_size = int(np.sqrt(n_features))
        candidates = np.random.choice(n_features, subset_size, replace=False)

        # Search every candidate feature and observed value for the best split.
        top_gain, top_feat, top_thresh = -1, None, None
        for feat in candidates:
            column = X[:, feat]
            for thresh in np.unique(column):
                left = y[column <= thresh]
                right = y[column > thresh]

                # A split with an empty side is not a split at all.
                if len(left) == 0 or len(right) == 0:
                    continue

                gain = self._gini_gain(y, left, right)
                if gain > top_gain:
                    top_gain, top_feat, top_thresh = gain, feat, thresh

        # No split improves purity: make this node a leaf.
        if top_gain <= 0:
            return np.bincount(y).argmax()

        # Recurse, left side first.
        goes_left = X[:, top_feat] <= top_thresh
        goes_right = ~goes_left

        left_tree = self._grow_tree(X[goes_left], y[goes_left], depth + 1)
        right_tree = self._grow_tree(X[goes_right], y[goes_right], depth + 1)

        return (top_feat, top_thresh, left_tree, right_tree)

    def _gini_gain(self, parent, left, right):
        """Return the parent's Gini impurity minus the size-weighted child impurity."""
        def impurity(labels):
            if len(labels) == 0:
                return 0
            freq = np.bincount(labels) / len(labels)
            return 1 - np.sum(freq ** 2)

        total = len(parent)
        weighted_children = (len(left) / total * impurity(left)
                             + len(right) / total * impurity(right))
        return impurity(parent) - weighted_children

    def predict_one(self, x, node):
        """Descend from ``node`` to a leaf for sample ``x`` and return its label."""
        # Anything that is not a tuple is a leaf label.
        while isinstance(node, tuple):
            feat, thresh, left, right = node
            node = left if x[feat] <= thresh else right
        return node

    def predict(self, X):
        """Return an array with the predicted label for each row of X."""
        return np.array([self.predict_one(row, self.tree) for row in X])

RandomForest

In [None]:
class RandomForest:
    """Random forest classifier: bagged ``DecisionTree`` models with majority voting."""

    def __init__(self, n_estimators=100, max_depth=10, min_samples_split=2):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.trees = []

    def fit(self, X, y):
        """Train ``n_estimators`` trees, each on its own bootstrap sample.

        X: array-like of shape (n_samples, n_features).
        y: array-like of n_samples integer class labels.
        Trees from an earlier call to ``fit`` are discarded first, so a
        refit never mixes stale trees into the vote.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = X.shape[0]
        self.trees = []

        for i in range(self.n_estimators):
            # 1. Bootstrap sample: n_samples row indices drawn with replacement.
            indices = np.random.choice(n_samples, n_samples, replace=True)
            X_boot = X[indices]
            y_boot = y[indices]

            # 2. Train one tree on that sample.
            tree = DecisionTree(
                max_depth=self.max_depth,
                min_samples_split=self.min_samples_split
            )
            tree.fit(X_boot, y_boot)
            self.trees.append(tree)

            # Progress report every 20 trees.
            if (i+1) % 20 == 0:
                print(f"Trained tree {i+1}/{self.n_estimators}")

    def predict(self, X):
        """Return an array with the majority-vote label for each row of X.

        On a tie, the label that appears first among the trees' votes wins.
        """
        X = np.asarray(X)
        # Row t holds tree t's predictions; column i holds the votes for sample i.
        tree_preds = np.array([tree.predict(X) for tree in self.trees])

        final_predictions = []
        for i in range(X.shape[0]):
            votes = tree_preds[:, i]
            most_common = Counter(votes).most_common(1)[0][0]
            final_predictions.append(most_common)

        return np.array(final_predictions)

TP = 80, FN = 20, FP = 15, TN = 85
Recall = TP / (TP + FN) = 80 / (80 + 20) = 0.8 或 80%
FPR = FP / (FP + TN) = FP / N = 15 / (15 + 85) = 0.15 或 15%（N 为实际负样本总数）

3 问题：如果验证集损失（validation loss）显著高于训练集损失（training loss），可能的原因是什么？

回答要点：这是过拟合（Overfitting） 的典型标志。模型过度学习了训练数据中的噪声和细节，导致在未见过的验证数据上泛化能力变差。处理方法包括：获取更多数据、进行数据增强、增加正则化（如L1/L2）、使用Dropout（对神经网络）、或简化模型复杂度。

4 问题：在处理分类问题时，如何有意识地增加模型的偏差（Bias）并减少方差（Variance）？

回答要点：这通常是为了解决过拟合（高方差）。方法包括：简化模型（如减少树的最大深度、减少神经网络层数）、增加正则化强度、减少特征数量、或使用Bagging类方法（如随机森林）来平均多个高方差模型的结果（注意：Bagging主要降低方差，对偏差的影响很小；前几种方法才是以增加偏差为代价来换取更低的方差）。

In [None]:
神经网络输出计算题
1. MLP输出计算
问题：

输入层：3个神经元，输入为 [1.0, 0.5, -0.2]

隐藏层：2个神经元，激活函数为线性

输出层：1个神经元，激活函数为sigmoid

权重：

W1 = [[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]]

b1 = [0.1, 0.2]

W2 = [[0.7], [0.8]]

b2 = [0.3]

答案：

步骤1：计算隐藏层输出
h1 = 1.0*0.1 + 0.5*0.3 + (-0.2)*0.5 + 0.1 = 0.1 + 0.15 - 0.1 + 0.1 = 0.25
h2 = 1.0*0.2 + 0.5*0.4 + (-0.2)*0.6 + 0.2 = 0.2 + 0.2 - 0.12 + 0.2 = 0.48

步骤2：计算输出层输入
z = 0.25*0.7 + 0.48*0.8 + 0.3 = 0.175 + 0.384 + 0.3 = 0.859

步骤3：应用sigmoid
output = 1 / (1 + e^(-0.859)) ≈ 1 / (1 + 0.424) ≈ 1 / 1.424 ≈ 0.702

答案：0.702（保留三位小数）

In [12]:
# Numeric check of the MLP forward pass worked out by hand above.
# Hidden layer (linear activation): weighted sum of the inputs plus bias.
h1 = 1.0*0.1 + 0.5*0.3 + (-0.2)*0.5 + 0.1
h2 = 1.0*0.2 + 0.5*0.4 + (-0.2)*0.6 + 0.2
# Output pre-activation. NOTE: uses the literals 0.25 and 0.48, not h1 / h2.
z = 0.25*0.7 + 0.48*0.8 + 0.3
# Sigmoid output. NOTE: uses the literal 0.859, not the variable z.
output = 1 / (1 + np.exp(-0.859))
# Bare expression so the notebook displays the tuple as the cell result.
h1, h2, z, output

(0.25, 0.48000000000000004, 0.859, np.float64(0.7024516833527672))

In [None]:
我现在有interview的一些资料，根据下面资料生成interview时的面试题和对应答案，

第二个OA: 70分钟，6道选择题+1道填空题＋3道code。
选择题基本上都是ML相关题，比如要你算recall，LDA和PCA区别，overfitting处理啥的。填空题是给你一个NN结构和input，你算最后output。最搞心态的是，前面都是linear function，最后输出层它搞一个sigmoid function，然后算出来x还是个小数。你告诉我这没计算器怎么算？最后结果要求小数点后三位。


第一道code：给一个数组和一个区间n，算有哪些local max value。比如[1,3,10,4,2,19,5,5]和区间n=2，10是一个local max value，因为[1,3,10,4,2]里10最大，然后10前面的数字严格单调递增，后面的数字严格单调递减（大于等于，小于等于都不行）。所以19不是。
第二道code：手写bootstrap算法，补全code块（不准对已有code任何改动）。给你多个sklearn的classifiers，x和y。输出经过majority voting后的predicted y。答案大概长这样：
from random import randint, seed
from sklearn.tree import DecisionTreeClassifier
import numpy as np

def bootstrap(n: int) -> list[int]:
    """
    Step 1: Bootstrap the train samples for each base classifier.

    Draws ``n`` indices from ``0 .. n-1`` with replacement, one
    ``randint`` call per index, and returns them as a list.
    """
    upper = n - 1  # randint is inclusive on both ends
    return [randint(0, upper) for _ in range(n)]

def fit(classifiers: list[DecisionTreeClassifier], x: list[list[float]], y: list[int]):
    """
    Step 2: Train each classifier based on its own bootstrapped samples.

    Every classifier gets a fresh call to ``bootstrap`` and is fitted on
    the rows of ``x`` / ``y`` selected by those indices.
    """
    sample_count = len(x)
    for model in classifiers:
        picked = bootstrap(sample_count)
        model.fit([x[idx] for idx in picked], [y[idx] for idx in picked])

def predict(classifiers: list[DecisionTreeClassifier], x: list[list[float]]) -> list[int]:
    """
    Step 3: Assign class labels by a majority vote of the base classifiers.

    Each classifier's ``predict(x)`` contributes one vote per sample. The
    label with the most votes wins; ties go to the smallest label in sorted
    order. Votes are counted with ``np.unique`` rather than ``np.bincount``
    so negative labels such as ``-1`` do not raise.

    Raises ValueError when ``classifiers`` is empty.
    """
    if len(classifiers) == 0:
        raise ValueError("predict requires at least one classifier")

    # Row i holds classifier i's predictions; column j holds all votes for sample j.
    votes = np.array([clf.predict(x) for clf in classifiers])

    winners = []
    for sample_votes in votes.T:
        labels, counts = np.unique(sample_votes, return_counts=True)
        # np.unique sorts labels, so argmax picks the smallest label on ties.
        winners.append(labels[counts.argmax()])
    return np.asarray(winners).tolist()

def solution(x_train: list[list[float]], y_train: list[int], x_test: list[list[float]], n_estimators: int) -> list[int]:
    """
    Step 4: Pull everything together

    Seeds the ``random`` module, builds ``n_estimators`` decision trees,
    trains them with ``fit`` and returns the labels ``predict`` produces
    for ``x_test``.
    """
    seed(42)  # fixed seed so the bootstrap index draws are repeatable
    ensemble = []
    for _ in range(n_estimators):
        ensemble.append(DecisionTreeClassifier(random_state=0))
    fit(ensemble, x_train, y_train)
    return predict(ensemble, x_test)
复制代码
第三道code：手动实现Naive Bayes算法。到这我已经没时间了。


OA2


5 道选择
问Random Forest和XGBoost的区别
怎么increase bias and reduce variance
LDA和PCA分别的适用场景
validation loss is significantly higher than training loss，问有可能是什么原因
给了4个confusion matrices, 选出所有recall >= 90%同时FPR < 10%的



1道填空
一个有1个hidden layer的MLP，hidden activation是linear，output activation是sigmoid，给了input和所有weights，算output



3道coding
find the longest contiguous substring consisting of the same character：找到连续出现最多次的character，返回这个character和连续出现的次数
Bootstrap
Decision Tree

70分钟10题， CodeSignal Online Assessment。时间很紧，楼主也不记得原题了，请大家谅解。几个tips
1- coding基本写出来就不错了。时间太紧了，不要worry about optimality，除非test 没过再回来改。ML coding 的description非常的长，可能看起来会很花时间，所以要复习一下tree algorithm自己写一写，再去。可以先做coding，再回来做选择题。
2 - multiple choices 考ML fundamental 考的非常非常细。如果不是很confident就不要花太多时间纠结了。


7个Multiple Choices，抱歉不是全记得了，就写几个记得 ，TLDR 硬币基地很喜欢考各种tree algorithm，复习好了tree再去做这个tech screen

1 - naive bayes 和 knn的优劣（比如说curse of dimensionality, multicollinearity 方面）
2- implement forward propagation for 3 layers of simple feedforward with linear/no activation function in first 2 layers and sigmoid in last layers. 这题有点变态，因为没说能让用calculator，楼主很诚实， 所以楼主纠结了一下sigmoid 估算。
3- ensemble algorithm的优劣（比如说是不是training extensive，是不是容易overfit）
4- random forest和GBT 的优劣 (比如说inference time，overfitting， difficult to train)
5 - LDA 和PCA的优劣和区别


3个coding。
第一题coding非常简单，求string中longest consecutive sequence with identical characters 长度. 比如说aaabbc -> 3 。
第二题要求implement random forest里面的bootstrapping，bagging 的training和prediction。
第三道题楼主没时间做了，也是implement random forest里面的其他的一些component
楼主的Multiple choices是随便答的因为没有复习，coding做到后面也没太多时间做了，总之没啥准备。希望想去的同学还是准备好再做这个。
楼主确实做的很烂，主要是真的很久没有用到各种tree algorithm了，但是recruiter说senior 这个算过，staff不算过。

70 分钟10道题
前几个都是MLE八股文， 还有算MLP output, 最后是两道coding, 一道 logistic regression, test case 全过， 第二道naive bayes, 没时间写了直接提交了。

70 分钟 10 道题目

1-7 是一些简单的ml basics, 问一些model的不同，怎么处理各种情况，比如overfitting
8 一个非常简单的leetcode，大概是找字符串里面最后一个最大的连续重复的substring
9 不让用numpy然后手写gradient descent，但是给的蛮简单的，是一个linear function然后迭代的formula也都给了，稍微注意一下matrix dimension就好了
10 最后一题也是手写，写kmeans，我只写到了更新cluster center，最后的main function没时间了， 但是最后也过了

OA2是mle coding+basics；有选择+填空+coding；影响最深的是有个填空题算loss，精确到小数点后三位… 还没给计算器，实在不知道这题的意义在哪里
coding是 decision tree，logistic regression和knn；里面给了些implementation然后让你填上剩下的、我觉得这种对我非常不友好，不如让我写一个…
还有就是之前看instruction说不用run，就没看test case是不是pass；太蠢了，太久没做这种OA了；不知道自己咋想的
反正fail了，move forward